Merge pull request #664 from taka-no-me/ocl

Move OpenCL SURF to nonfree module
Committed by Andrey Kamaev on 2013-03-21 09:01:59 -07:00
Commit: b6365699ee
135 changed files with 3042 additions and 3175 deletions

View File

@@ -141,9 +141,9 @@ OCV_OPTION(WITH_V4L "Include Video 4 Linux support" ON
OCV_OPTION(WITH_VIDEOINPUT "Build HighGUI with DirectShow support" ON IF WIN32 )
OCV_OPTION(WITH_XIMEA "Include XIMEA cameras support" OFF IF (NOT ANDROID AND NOT APPLE) )
OCV_OPTION(WITH_XINE "Include Xine support (GPL)" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) )
-OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS) )
-OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS) )
-OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" ON IF (NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" ON IF (NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" ON IF (NOT ANDROID AND NOT IOS) )
# OpenCV build components
@@ -412,15 +412,6 @@ endif()
# --- OpenCL ---
if(WITH_OPENCL)
include(cmake/OpenCVDetectOpenCL.cmake)
-if(OPENCL_FOUND)
-set(HAVE_OPENCL 1)
-endif()
-if(WITH_OPENCLAMDFFT AND CLAMDFFT_INCLUDE_DIR)
-set(HAVE_CLAMDFFT 1)
-endif()
-if(WITH_OPENCLAMDBLAS AND CLAMDBLAS_INCLUDE_DIR)
-set(HAVE_CLAMDBLAS 1)
-endif()
endif()
# ----------------------------------------------------------------------------
@@ -791,17 +782,17 @@ if(HAVE_CUDA)
status(" Use fast math:" CUDA_FAST_MATH THEN YES ELSE NO)
endif()
-if(HAVE_OPENCL AND BUILD_opencv_ocl)
+if(HAVE_OPENCL)
status("")
status(" OpenCL")
if(OPENCL_INCLUDE_DIR)
-status(" Include:" ${OPENCL_INCLUDE_DIR})
+status(" Include path:" ${OPENCL_INCLUDE_DIRS})
endif()
if(OPENCL_LIBRARIES)
status(" libraries:" ${OPENCL_LIBRARIES})
endif()
-status(" Use AMDFFT:" HAVE_CLAMDFFT THEN YES ELSE NO)
-status(" Use AMDBLAS:" HAVE_CLAMDBLAS THEN YES ELSE NO)
+status(" Use AMD FFT:" HAVE_CLAMDFFT THEN YES ELSE NO)
+status(" Use AMD BLAS:" HAVE_CLAMDBLAS THEN YES ELSE NO)
endif()
# ========================== python ==========================

View File

@@ -1,154 +1,104 @@
if(APPLE)
set(OPENCL_FOUND YES)
-set(OPENCL_LIBRARIES "-framework OpenCL")
-else()
+set(OPENCL_LIBRARY "-framework OpenCL" CACHE STRING "OpenCL library")
+set(OPENCL_INCLUDE_DIR "" CACHE STRING "OpenCL include directory")
+mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY)
+else(APPLE)
find_package(OpenCL QUIET)
-if(WITH_OPENCLAMDFFT)
-set(CLAMDFFT_SEARCH_PATH $ENV{CLAMDFFT_PATH})
-if(NOT CLAMDFFT_SEARCH_PATH)
-if(WIN32)
-set( CLAMDFFT_SEARCH_PATH "C:\\Program Files (x86)\\AMD\\clAmdFft" )
-endif()
-endif()
-set( CLAMDFFT_INCLUDE_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}/include )
-if(UNIX)
-if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-set(CLAMDFFT_LIB_SEARCH_PATH /usr/lib)
-else()
-set(CLAMDFFT_LIB_SEARCH_PATH /usr/lib64)
-endif()
-else()
-if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-set(CLAMDFFT_LIB_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}\\lib32\\import)
-else()
-set(CLAMDFFT_LIB_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}\\lib64\\import)
-endif()
-endif()
-find_path(CLAMDFFT_INCLUDE_DIR
-NAMES clAmdFft.h
-PATHS ${CLAMDFFT_INCLUDE_SEARCH_PATH}
-PATH_SUFFIXES clAmdFft
-NO_DEFAULT_PATH)
-find_library(CLAMDFFT_LIBRARY
-NAMES clAmdFft.Runtime
-PATHS ${CLAMDFFT_LIB_SEARCH_PATH}
-NO_DEFAULT_PATH)
-if(CLAMDFFT_LIBRARY)
-set(CLAMDFFT_LIBRARIES ${CLAMDFFT_LIBRARY})
-else()
-set(CLAMDFFT_LIBRARIES "")
-endif()
-endif()
-if(WITH_OPENCLAMDBLAS)
-set(CLAMDBLAS_SEARCH_PATH $ENV{CLAMDBLAS_PATH})
-if(NOT CLAMDBLAS_SEARCH_PATH)
-if(WIN32)
-set( CLAMDBLAS_SEARCH_PATH "C:\\Program Files (x86)\\AMD\\clAmdBlas" )
-endif()
-endif()
-set( CLAMDBLAS_INCLUDE_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}/include )
-if(UNIX)
-if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-set(CLAMDBLAS_LIB_SEARCH_PATH /usr/lib)
-else()
-set(CLAMDBLAS_LIB_SEARCH_PATH /usr/lib64)
-endif()
-else()
-if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-set(CLAMDBLAS_LIB_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}\\lib32\\import)
-else()
-set(CLAMDBLAS_LIB_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}\\lib64\\import)
-endif()
-endif()
-find_path(CLAMDBLAS_INCLUDE_DIR
-NAMES clAmdBlas.h
-PATHS ${CLAMDBLAS_INCLUDE_SEARCH_PATH}
-PATH_SUFFIXES clAmdBlas
-NO_DEFAULT_PATH)
-find_library(CLAMDBLAS_LIBRARY
-NAMES clAmdBlas
-PATHS ${CLAMDBLAS_LIB_SEARCH_PATH}
-NO_DEFAULT_PATH)
-if(CLAMDBLAS_LIBRARY)
-set(CLAMDBLAS_LIBRARIES ${CLAMDBLAS_LIBRARY})
-else()
-set(CLAMDBLAS_LIBRARIES "")
-endif()
-endif()
-# Try AMD/ATI Stream SDK
if (NOT OPENCL_FOUND)
-set(ENV_AMDSTREAMSDKROOT $ENV{AMDAPPSDKROOT})
-set(ENV_AMDAPPSDKROOT $ENV{AMDAPPSDKROOT})
-set(ENV_OPENCLROOT $ENV{OPENCLROOT})
-set(ENV_CUDA_PATH $ENV{CUDA_PATH})
-set(ENV_INTELOCLSDKROOT $ENV{INTELOCLSDKROOT})
-if(ENV_AMDSTREAMSDKROOT)
-set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_AMDAPPSDKROOT}/include)
-if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDAPPSDKROOT}/lib/x86)
-else()
-set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDAPPSDKROOT}/lib/x86_64)
-endif()
-elseif(ENV_AMDSTREAMSDKROOT)
-set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_AMDSTREAMSDKROOT}/include)
-if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86)
-else()
-set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86_64)
-endif()
-elseif(ENV_CUDA_PATH AND WIN32)
-set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_CUDA_PATH}/include)
-if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/Win32)
-else()
-set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/x64)
-endif()
-elseif(ENV_OPENCLROOT AND UNIX)
-set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_OPENCLROOT}/inc)
-if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib)
-else()
-set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib64)
-endif()
-elseif(ENV_INTELOCLSDKROOT)
-set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_INTELOCLSDKROOT}/include)
-if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_INTELOCLSDKROOT}/lib/x86)
-else()
-set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_INTELOCLSDKROOT}/lib/x64)
-endif()
+find_path(OPENCL_ROOT_DIR
+NAMES OpenCL/cl.h CL/cl.h include/CL/cl.h include/nvidia-current/CL/cl.h
+PATHS ENV OCLROOT ENV AMDAPPSDKROOT ENV CUDA_PATH ENV INTELOCLSDKROOT
+DOC "OpenCL root directory"
+NO_DEFAULT_PATH)
+find_path(OPENCL_INCLUDE_DIR
+NAMES OpenCL/cl.h CL/cl.h
+HINTS ${OPENCL_ROOT_DIR}
+PATH_SUFFIXES include include/nvidia-current
+DOC "OpenCL include directory")
+if (X86_64)
+set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win64 lib/x86_64 lib/x64)
+elseif (X86)
+set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win32 lib/x86)
endif()
-if(OPENCL_INCLUDE_SEARCH_PATH)
-find_path(OPENCL_INCLUDE_DIR
-NAMES CL/cl.h OpenCL/cl.h
-PATHS ${OPENCL_INCLUDE_SEARCH_PATH}
-NO_DEFAULT_PATH)
-else()
-find_path(OPENCL_INCLUDE_DIR
-NAMES CL/cl.h OpenCL/cl.h)
-endif()
-if(OPENCL_LIB_SEARCH_PATH)
-find_library(OPENCL_LIBRARY NAMES OpenCL PATHS ${OPENCL_LIB_SEARCH_PATH} NO_DEFAULT_PATH)
-else()
-find_library(OPENCL_LIBRARY NAMES OpenCL)
-endif()
-mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY)
+find_library(OPENCL_LIBRARY
+NAMES OpenCL
+HINTS ${OPENCL_ROOT_DIR}
+PATH_SUFFIXES ${OPENCL_POSSIBLE_LIB_SUFFIXES}
+DOC "OpenCL library")
include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(
-OPENCL
-DEFAULT_MSG
-OPENCL_LIBRARY OPENCL_INCLUDE_DIR
-)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPENCL DEFAULT_MSG OPENCL_LIBRARY OPENCL_INCLUDE_DIR )
+endif()
+endif(APPLE)
if(OPENCL_FOUND)
-set(OPENCL_LIBRARIES ${OPENCL_LIBRARY})
-set(HAVE_OPENCL 1)
-else()
-set(OPENCL_LIBRARIES)
+set(HAVE_OPENCL 1)
+set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})
+set(OPENCL_LIBRARIES ${OPENCL_LIBRARY})
+if (X86_64)
+set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import)
+elseif (X86)
+set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import)
+endif()
+if(WITH_OPENCLAMDFFT)
+find_path(CLAMDFFT_ROOT_DIR
+NAMES include/clAmdFft.h
+PATHS ENV CLAMDFFT_PATH ENV ProgramFiles
+PATH_SUFFIXES clAmdFft AMD/clAmdFft
+DOC "AMD FFT root directory"
+NO_DEFAULT_PATH)
+find_path(CLAMDFFT_INCLUDE_DIR
+NAMES clAmdFft.h
+HINTS ${CLAMDFFT_ROOT_DIR}
+PATH_SUFFIXES include
+DOC "clAmdFft include directory")
+find_library(CLAMDFFT_LIBRARY
+NAMES clAmdFft.Runtime
+HINTS ${CLAMDFFT_ROOT_DIR}
+PATH_SUFFIXES ${CLAMD_POSSIBLE_LIB_SUFFIXES}
+DOC "clAmdFft library")
+if(CLAMDFFT_LIBRARY AND CLAMDFFT_INCLUDE_DIR)
+set(HAVE_CLAMDFFT 1)
+list(APPEND OPENCL_INCLUDE_DIRS "${CLAMDFFT_INCLUDE_DIR}")
+list(APPEND OPENCL_LIBRARIES "${CLAMDFFT_LIBRARY}")
+endif()
+endif()
+if(WITH_OPENCLAMDBLAS)
+find_path(CLAMDBLAS_ROOT_DIR
+NAMES include/clAmdBlas.h
+PATHS ENV CLAMDFFT_PATH ENV ProgramFiles
+PATH_SUFFIXES clAmdBlas AMD/clAmdBlas
+DOC "AMD FFT root directory"
+NO_DEFAULT_PATH)
+find_path(CLAMDBLAS_INCLUDE_DIR
+NAMES clAmdBlas.h
+HINTS ${CLAMDBLAS_ROOT_DIR}
+PATH_SUFFIXES include
+DOC "clAmdFft include directory")
+find_library(CLAMDBLAS_LIBRARY
+NAMES clAmdBlas
+HINTS ${CLAMDBLAS_ROOT_DIR}
+PATH_SUFFIXES ${CLAMD_POSSIBLE_LIB_SUFFIXES}
+DOC "clAmdBlas library")
+if(CLAMDBLAS_LIBRARY AND CLAMDBLAS_INCLUDE_DIR)
+set(HAVE_CLAMDBLAS 1)
+list(APPEND OPENCL_INCLUDE_DIRS "${CLAMDBLAS_INCLUDE_DIR}")
+list(APPEND OPENCL_LIBRARIES "${CLAMDBLAS_LIBRARY}")
+endif()
+endif()
endif()
-else()
-set(HAVE_OPENCL 1)
endif()
endif()

View File

@@ -432,10 +432,22 @@ macro(ocv_glob_module_sources)
file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
+file(GLOB cl_kernels "src/opencl/*.cl")
source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Include" FILES ${lib_hdrs})
source_group("Include\\detail" FILES ${lib_hdrs_detail})
+if(HAVE_OPENCL AND cl_kernels)
+ocv_include_directories(${OPENCL_INCLUDE_DIRS})
+add_custom_command(
+OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp"
+COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
+DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
+source_group("Src\\OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
+list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
+endif()
ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} SOURCES ${lib_srcs} ${lib_int_hdrs})
endmacro()
@@ -449,6 +461,9 @@ macro(ocv_create_module)
if(NOT "${ARGN}" STREQUAL "SKIP_LINK")
target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN})
+if(HAVE_OPENCL AND OPENCL_LIBRARIES)
+target_link_libraries(${the_module} ${OPENCL_LIBRARIES})
+endif()
endif()
add_dependencies(opencv_modules ${the_module})

View File

@@ -3,7 +3,7 @@ if(BUILD_ANDROID_PACKAGE)
endif()
set(the_description "Functionality with possible limitations on the use")
-ocv_add_module(nonfree opencv_imgproc opencv_features2d opencv_calib3d OPTIONAL opencv_gpu)
+ocv_add_module(nonfree opencv_imgproc opencv_features2d opencv_calib3d OPTIONAL opencv_gpu opencv_ocl)
ocv_module_include_directories()
if(HAVE_CUDA AND HAVE_opencv_gpu)

View File

@@ -129,7 +129,6 @@ The function is parallelized with the TBB library.
If you are using the C version, make sure you call ``cv::initModule_nonfree()`` from ``nonfree/nonfree.hpp``.
gpu::SURF_GPU
-------------
.. ocv:class:: gpu::SURF_GPU
@@ -230,3 +229,102 @@ The ``descriptors`` matrix is :math:`\texttt{nFeatures} \times \texttt{descripto
The class ``SURF_GPU`` uses some buffers and provides access to it. All buffers can be safely released between function calls.
.. seealso:: :ocv:class:`SURF`
ocl::SURF_OCL
-------------
.. ocv:class:: ocl::SURF_OCL
Class used for extracting Speeded Up Robust Features (SURF) from an image. ::
class SURF_OCL
{
public:
enum KeypointLayout
{
X_ROW = 0,
Y_ROW,
LAPLACIAN_ROW,
OCTAVE_ROW,
SIZE_ROW,
ANGLE_ROW,
HESSIAN_ROW,
ROWS_COUNT
};
//! the default constructor
SURF_OCL();
//! the full constructor taking all the necessary parameters
explicit SURF_OCL(double _hessianThreshold, int _nOctaves=4,
int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);
//! returns the descriptor size in float's (64 or 128)
int descriptorSize() const;
//! upload host keypoints to device memory
void uploadKeypoints(const vector<KeyPoint>& keypoints,
oclMat& keypointsocl);
//! download keypoints from device to host memory
void downloadKeypoints(const oclMat& keypointsocl,
vector<KeyPoint>& keypoints);
//! download descriptors from device to host memory
void downloadDescriptors(const oclMat& descriptorsocl,
vector<float>& descriptors);
void operator()(const oclMat& img, const oclMat& mask,
oclMat& keypoints);
void operator()(const oclMat& img, const oclMat& mask,
oclMat& keypoints, oclMat& descriptors,
bool useProvidedKeypoints = false);
void operator()(const oclMat& img, const oclMat& mask,
std::vector<KeyPoint>& keypoints);
void operator()(const oclMat& img, const oclMat& mask,
std::vector<KeyPoint>& keypoints, oclMat& descriptors,
bool useProvidedKeypoints = false);
void operator()(const oclMat& img, const oclMat& mask,
std::vector<KeyPoint>& keypoints,
std::vector<float>& descriptors,
bool useProvidedKeypoints = false);
void releaseMemory();
// SURF parameters
double hessianThreshold;
int nOctaves;
int nOctaveLayers;
bool extended;
bool upright;
//! max keypoints = min(keypointsRatio * img.size().area(), 65535)
float keypointsRatio;
oclMat sum, mask1, maskSum, intBuffer;
oclMat det, trace;
oclMat maxPosBuffer;
};
The class ``SURF_OCL`` implements Speeded Up Robust Features descriptor. There is a fast multi-scale Hessian keypoint detector that can be used to find the keypoints (which is the default option). But the descriptors can also be computed for the user-specified keypoints. Only 8-bit grayscale images are supported.
The class ``SURF_OCL`` can store results in the GPU and CPU memory. It provides functions to convert results between CPU and GPU version ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``oclMat``. The ``keypoints`` matrix is :math:`\texttt{nFeatures} \times 7` matrix with the ``CV_32FC1`` type.
* ``keypoints.ptr<float>(X_ROW)[i]`` contains x coordinate of the i-th feature.
* ``keypoints.ptr<float>(Y_ROW)[i]`` contains y coordinate of the i-th feature.
* ``keypoints.ptr<float>(LAPLACIAN_ROW)[i]`` contains the laplacian sign of the i-th feature.
* ``keypoints.ptr<float>(OCTAVE_ROW)[i]`` contains the octave of the i-th feature.
* ``keypoints.ptr<float>(SIZE_ROW)[i]`` contains the size of the i-th feature.
* ``keypoints.ptr<float>(ANGLE_ROW)[i]`` contains the orientation of the i-th feature.
* ``keypoints.ptr<float>(HESSIAN_ROW)[i]`` contains the response of the i-th feature.
The ``descriptors`` matrix is :math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type.
The class ``SURF_OCL`` uses some buffers and provides access to them. All buffers can be safely released between function calls.
.. seealso:: :ocv:class:`SURF`
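A minimal usage sketch for the interface above (not part of this commit; the file name and the Hessian threshold of 400 are placeholder choices, and it assumes OpenCV was built with the ocl module and OpenCL support enabled): ::

    #include <vector>
    #include "opencv2/highgui/highgui.hpp"
    #include "opencv2/nonfree/ocl.hpp"   // also pulls in opencv2/ocl/ocl.hpp

    int main()
    {
        // SURF_OCL accepts only 8-bit grayscale input.
        cv::Mat img = cv::imread("image.png", cv::IMREAD_GRAYSCALE);
        if (img.empty())
            return -1;

        // Upload the image; an empty oclMat means "no mask".
        cv::ocl::oclMat d_img(img), d_mask;
        cv::ocl::oclMat d_keypoints, d_descriptors;

        cv::ocl::SURF_OCL surf(400.0);   // hessianThreshold; other parameters keep their defaults
        surf(d_img, d_mask, d_keypoints, d_descriptors);

        // Convert the GPU results back to the ordinary CPU SURF representation.
        std::vector<cv::KeyPoint> keypoints;
        std::vector<float> descriptors;
        surf.downloadKeypoints(d_keypoints, keypoints);
        surf.downloadDescriptors(d_descriptors, descriptors);
        return 0;
    }

Keeping intermediate results in ``oclMat`` between calls avoids repeated host/device transfers; the performance tests added later in this pull request time the same pipeline with and without those transfers.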

View File

@@ -0,0 +1,124 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_NONFREE_OCL_HPP__
#define __OPENCV_NONFREE_OCL_HPP__
#include "opencv2/ocl/ocl.hpp"
namespace cv
{
namespace ocl
{
//! Speeded up robust features, port from GPU module.
////////////////////////////////// SURF //////////////////////////////////////////
class CV_EXPORTS SURF_OCL
{
public:
enum KeypointLayout
{
X_ROW = 0,
Y_ROW,
LAPLACIAN_ROW,
OCTAVE_ROW,
SIZE_ROW,
ANGLE_ROW,
HESSIAN_ROW,
ROWS_COUNT
};
//! the default constructor
SURF_OCL();
//! the full constructor taking all the necessary parameters
explicit SURF_OCL(double _hessianThreshold, int _nOctaves = 4,
int _nOctaveLayers = 2, bool _extended = false, float _keypointsRatio = 0.01f, bool _upright = false);
//! returns the descriptor size in float's (64 or 128)
int descriptorSize() const;
//! upload host keypoints to device memory
void uploadKeypoints(const vector<cv::KeyPoint> &keypoints, oclMat &keypointsocl);
//! download keypoints from device to host memory
void downloadKeypoints(const oclMat &keypointsocl, vector<KeyPoint> &keypoints);
//! download descriptors from device to host memory
void downloadDescriptors(const oclMat &descriptorsocl, vector<float> &descriptors);
//! finds the keypoints using fast hessian detector used in SURF
//! supports CV_8UC1 images
//! keypoints will have nFeature cols and 6 rows
//! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
//! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
//! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
//! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
//! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
//! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
//! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints);
//! finds the keypoints and computes their descriptors.
//! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors,
bool useProvidedKeypoints = false);
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints);
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, oclMat &descriptors,
bool useProvidedKeypoints = false);
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, std::vector<float> &descriptors,
bool useProvidedKeypoints = false);
void releaseMemory();
// SURF parameters
float hessianThreshold;
int nOctaves;
int nOctaveLayers;
bool extended;
bool upright;
//! max keypoints = min(keypointsRatio * img.size().area(), 65535)
float keypointsRatio;
oclMat sum, mask1, maskSum, intBuffer;
oclMat det, trace;
oclMat maxPosBuffer;
};
}
}
#endif //__OPENCV_NONFREE_OCL_HPP__

View File

@@ -1,3 +1,4 @@
#include "perf_precomp.hpp"
+#include "opencv2/ts/gpu_perf.hpp"
CV_PERF_TEST_MAIN(nonfree, perf::printCudaInfo())

View File

@@ -9,14 +9,15 @@
#ifndef __OPENCV_PERF_PRECOMP_HPP__
#define __OPENCV_PERF_PRECOMP_HPP__
#include "cvconfig.h"
#include "opencv2/opencv_modules.hpp"
#include "opencv2/ts/ts.hpp"
#include "opencv2/ts/gpu_perf.hpp"
#include "opencv2/nonfree/nonfree.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_OCL
# include "opencv2/nonfree/ocl.hpp"
#endif
#if defined(HAVE_OPENCV_GPU) && defined(HAVE_CUDA)
#include "opencv2/nonfree/gpu.hpp"
#endif

View File

@@ -43,61 +43,69 @@
//
//M*/
-#include "precomp.hpp"
-#include <iomanip>
-#ifdef HAVE_OPENCL
+#include "perf_precomp.hpp"
+#ifdef HAVE_OPENCV_OCL
using namespace cv;
using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
using namespace std;
-#define FILTER_IMAGE "../../../samples/gpu/road.png"
-TEST(SURF, Performance)
+typedef perf::TestBaseWithParam<std::string> OCL_SURF;
+#define SURF_IMAGES \
+"cv/detectors_descriptors_evaluation/images_datasets/leuven/img1.png",\
+"stitching/a3.png"
+PERF_TEST_P(OCL_SURF, DISABLED_with_data_transfer, testing::Values(SURF_IMAGES))
{
-cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE);
+string filename = getDataPath(GetParam());
+Mat img = imread(filename, IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
-ocl::SURF_OCL d_surf;
-ocl::oclMat d_keypoints;
-ocl::oclMat d_descriptors;
-double totalgputick = 0;
-double totalgputick_kernel = 0;
-double t1 = 0;
-double t2 = 0;
-for(int j = 0; j < LOOP_TIMES + 1; j ++)
+SURF_OCL d_surf;
+oclMat d_keypoints;
+oclMat d_descriptors;
+Mat cpu_kp;
+Mat cpu_dp;
+declare.time(60);
+TEST_CYCLE()
{
-t1 = (double)cvGetTickCount();//gpu start1
-ocl::oclMat d_src(img);//upload
-t2 = (double)cvGetTickCount(); //kernel
-d_surf(d_src, ocl::oclMat(), d_keypoints, d_descriptors);
-t2 = (double)cvGetTickCount() - t2;//kernel
-cv::Mat cpu_kp, cpu_dp;
-d_keypoints.download (cpu_kp);//download
-d_descriptors.download (cpu_dp);//download
-t1 = (double)cvGetTickCount() - t1;//gpu end1
-if(j == 0)
-continue;
-totalgputick = t1 + totalgputick;
-totalgputick_kernel = t2 + totalgputick_kernel;
+oclMat d_src(img);
+d_surf(d_src, oclMat(), d_keypoints, d_descriptors);
+d_keypoints.download(cpu_kp);
+d_descriptors.download(cpu_dp);
}
-cout << "average gpu runtime is " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-cout << "average gpu runtime without data transfer is " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+SANITY_CHECK(cpu_kp, 1);
+SANITY_CHECK(cpu_dp, 1);
}
-#endif //Have opencl
+PERF_TEST_P(OCL_SURF, DISABLED_without_data_transfer, testing::Values(SURF_IMAGES))
+{
+string filename = getDataPath(GetParam());
+Mat img = imread(filename, IMREAD_GRAYSCALE);
+ASSERT_FALSE(img.empty());
+SURF_OCL d_surf;
+oclMat d_keypoints;
+oclMat d_descriptors;
+oclMat d_src(img);
+declare.time(60);
+TEST_CYCLE() d_surf(d_src, oclMat(), d_keypoints, d_descriptors);
+Mat cpu_kp;
+Mat cpu_dp;
+d_keypoints.download(cpu_kp);
+d_descriptors.download(cpu_dp);
+SANITY_CHECK(cpu_kp, 1);
+SANITY_CHECK(cpu_dp, 1);
+}
+#endif // HAVE_OPENCV_OCL

View File

@@ -104,11 +104,11 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM
// N = 2
// for simple haar paatern
float icvCalcHaarPatternSum_2(
IMAGE_INT32 sumTex,
__constant float src[2][5],
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
@@ -137,11 +137,11 @@ float icvCalcHaarPatternSum_2(
// N = 3
float icvCalcHaarPatternSum_3(
IMAGE_INT32 sumTex,
__constant float src[2][5],
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
@@ -170,11 +170,11 @@ float icvCalcHaarPatternSum_3(
// N = 4
float icvCalcHaarPatternSum_4(
IMAGE_INT32 sumTex,
__constant float src[2][5],
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
@@ -265,7 +265,7 @@ __kernel void icvCalcLayerDetAndTrace(
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
}
}
@@ -301,9 +301,9 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro
// Non-maximal suppression to further filtering the candidates from previous step
__kernel
void icvFindMaximaInLayer_withmask(
__global const float * det,
__global const float * trace,
__global int4 * maxPosBuffer,
volatile __global int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
@@ -345,26 +345,26 @@ __kernel
// Is this thread within the hessian buffer?
const int zoff = get_local_size(0) * get_local_size(1);
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
N9[localLin - zoff] =
det[det_step *
(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
N9[localLin ] =
det[det_step *
(c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
N9[localLin + zoff] =
det[det_step *
(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
barrier(CLK_LOCAL_MEM_FENCE);
if (i < c_layer_rows - margin
&& j < c_layer_cols - margin
&& get_local_id(0) > 0
&& get_local_id(0) < get_local_size(0) - 1
&& get_local_id(1) > 0
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
)
{
@@ -429,9 +429,9 @@ __kernel
__kernel
void icvFindMaximaInLayer(
__global float * det,
__global float * trace,
__global int4 * maxPosBuffer,
volatile __global int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
@@ -474,19 +474,19 @@ __kernel
int l_x = min(max(j, 0), c_img_cols - 1);
int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
N9[localLin - zoff] =
det[det_step * (l_y - c_layer_rows) + l_x];
N9[localLin ] =
det[det_step * (l_y ) + l_x];
N9[localLin + zoff] =
det[det_step * (l_y + c_layer_rows) + l_x];
barrier(CLK_LOCAL_MEM_FENCE);
if (i < c_layer_rows - margin
&& j < c_layer_cols - margin
&& get_local_id(0) > 0
&& get_local_id(0) < get_local_size(0) - 1
&& get_local_id(1) > 0
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
)
{
@@ -554,17 +554,17 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
{
F invdet = 1.0 / det;
x[0] = invdet *
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] ));
x[1] = invdet *
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0]));
x[2] = invdet *
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
@@ -585,9 +585,9 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
////////////////////////////////////////////////////////////////////////
// INTERPOLATION
__kernel
void icvInterpolateKeypoint(
__global const float * det,
__global const int4 * maxPosBuffer,
__global float * keypoints,
volatile __global int * featureCounter,
@@ -617,7 +617,7 @@ __kernel
volatile __local float N9[3][3][3];
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
det[det_step * (c_layer_rows * layer + i) + j];
barrier(CLK_LOCAL_MEM_FENCE);
@@ -715,27 +715,27 @@ __kernel
__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
0.001707611023448408f, 0.001455130288377404f};
@@ -748,14 +748,20 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc
data[tid] = *partial_reduction;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
-{
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
+barrier(CLK_LOCAL_MEM_FENCE);
+if (tid < 8)
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]);
+barrier(CLK_LOCAL_MEM_FENCE);
+if (tid < 4)
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
+barrier(CLK_LOCAL_MEM_FENCE);
+if (tid < 2)
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
-data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
-}
+barrier(CLK_LOCAL_MEM_FENCE);
+if (tid < 1)
+data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
#undef op
}
@@ -958,8 +964,8 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =
// utility for linear filter
inline uchar readerGet(
IMAGE_INT8 src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
int i, int j, int rows, int cols, int elemPerRow
)
{
@@ -969,8 +975,8 @@ inline uchar readerGet(
}
inline float linearFilter(
IMAGE_INT8 src,
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
float y, float x, int rows, int cols, int elemPerRow
)
{
@@ -1004,9 +1010,9 @@ void calc_dx_dy(
volatile __local float s_dx_bin[25],
volatile __local float s_dy_bin[25],
volatile __local float s_PATCH[6][6],
__global const float* featureX,
__global const float* featureY,
__global const float* featureSize,
__global const float* featureDir,
int rows,
int cols,
@@ -1058,26 +1064,26 @@ void calc_dx_dy(
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
const float vx = (
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
* dw;
const float vy = (
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
* dw;
s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy;
}
}
void reduce_sum25(
volatile __local float* sdata1,
volatile __local float* sdata2,
volatile __local float* sdata3,
volatile __local float* sdata4,
int tid
)
{
@@ -1115,13 +1121,13 @@ void reduce_sum25(
}
}
__kernel
void compute_descriptors64(
IMAGE_INT8 imgTex,
volatile __global float * descriptors,
__global const float * keypoints,
int descriptors_step,
int keypoints_step,
int rows,
int cols,
int img_step
@@ -1155,7 +1161,7 @@ __kernel
if (tid < 25)
{
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25)
{
@@ -1171,10 +1177,10 @@ __kernel
}
}
}
__kernel
void compute_descriptors128(
IMAGE_INT8 imgTex,
__global volatile float * descriptors,
__global float * keypoints,
int descriptors_step,
int keypoints_step,
@@ -1269,7 +1275,7 @@ __kernel
}
}
__kernel
void normalize_descriptors128(__global float * descriptors, int descriptors_step)
{
descriptors_step /= sizeof(*descriptors);
@@ -1310,7 +1316,7 @@ __kernel
// normalize and store in output
descriptor_base[get_local_id(0)] = lookup / len;
}
__kernel
void normalize_descriptors64(__global float * descriptors, int descriptors_step)
{
descriptors_step /= sizeof(*descriptors);

View File

@@ -66,4 +66,9 @@
#endif
#endif
+#ifdef HAVE_OPENCV_OCL
+# include "opencv2/nonfree/ocl.hpp"
+# include "opencv2/ocl/private/util.hpp"
+#endif
#endif

View File

@@ -42,10 +42,9 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
-#include <iomanip>
#include "precomp.hpp"
-#include "mcwutil.hpp"
-//#include "opencv2/highgui/highgui.hpp"
+#ifdef HAVE_OPENCV_OCL
using namespace cv;
using namespace cv::ocl;
@@ -56,7 +55,7 @@ namespace cv
namespace ocl
{
///////////////////////////OpenCL kernel strings///////////////////////////
-extern const char *nonfree_surf;
+extern const char *surf;
const char* noImage2dOption = "-D DISABLE_IMAGE2D";
@@ -76,10 +75,11 @@ namespace cv
}
-static inline int divUp(int total, int grain)
+static inline int divUp(size_t total, size_t grain)
{
return (total + grain - 1) / grain;
}
static inline int calcSize(int octave, int layer)
{
/* Wavelet size at first layer of first octave. */
@@ -268,7 +268,7 @@ private:
int maxFeatures;
oclMat counters;
// texture buffers
cl_mem imgTex;
cl_mem sumTex;
@@ -506,20 +506,20 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3] =
{
-divUp(max_samples_j, localThreads[0]) *localThreads[0],
-divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
+divUp(max_samples_j, localThreads[0]) * localThreads[0],
+divUp(max_samples_i, localThreads[1]) * localThreads[1] *(nOctaveLayers + 2),
1
};
-openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
-int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols)
+int octave, bool useMask, int nLayers, int layer_rows, int layer_cols)
{
const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
Context *clCxt = det.clCxt;
-string kernelName = use_mask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer";
+string kernelName = useMask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer";
vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
@@ -538,7 +538,7 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat
args.push_back( make_pair( sizeof(cl_int), (void *)&maxCandidates));
args.push_back( make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold));
-if(use_mask)
+if(useMask)
{
if(maskSumTex)
{
@@ -556,11 +556,11 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat
1
};
-openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
-oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures)
+oclMat &keypoints, oclMat &counters_, int octave, int layer_rows, int max_features)
{
Context *clCxt = det.clCxt;
string kernelName = "icvInterpolateKeypoint";
@@ -569,19 +569,19 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa
args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
-args.push_back( make_pair( sizeof(cl_mem), (void *)&counters.data));
+args.push_back( make_pair( sizeof(cl_mem), (void *)&counters_.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows));
-args.push_back( make_pair( sizeof(cl_int), (void *)&maxFeatures));
+args.push_back( make_pair( sizeof(cl_int), (void *)&max_features));
size_t localThreads[3] = {3, 3, 3};
size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};
-openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
@@ -608,7 +608,7 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat
size_t localThreads[3] = {32, 4, 1};
size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
-openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures)
@@ -625,7 +625,7 @@ void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures)
size_t localThreads[3] = {256, 1, 1};
size_t globalThreads[3] = {saturate_cast<size_t>(nFeatures), 1, 1};
-openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
@@ -633,7 +633,7 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
Context *clCxt = descriptors.clCxt;
-string kernelName = "";
+string kernelName;
vector< pair<size_t, const void *> > args;
size_t localThreads[3] = {1, 1, 1};
size_t globalThreads[3] = {1, 1, 1};
@@ -665,7 +665,7 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
-openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
kernelName = "normalize_descriptors64";
@@ -679,7 +679,7 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
-openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
@@ -707,8 +707,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows));
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
-openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
kernelName = "normalize_descriptors128";
@@ -721,7 +721,9 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.clear();
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
-openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
}
+#endif //HAVE_OPENCV_OCL


@ -7,7 +7,7 @@ using namespace cv::gpu;
using namespace cvtest; using namespace cvtest;
using namespace testing; using namespace testing;
int main(int argc, char** argv) int main(int argc, char **argv)
{ {
try try
{ {
@ -23,42 +23,42 @@ int main(int argc, char** argv)
{ {
cmd.printParams(); cmd.printParams();
return 0; return 0;
} }
printCudaInfo(); printCudaInfo();
if (cmd.get<bool>("info")) if (cmd.get<bool>("info"))
{ {
return 0; return 0;
} }
int device = cmd.get<int>("device"); int device = cmd.get<int>("device");
if (device < 0) if (device < 0)
{ {
DeviceManager::instance().loadAll(); DeviceManager::instance().loadAll();
std::cout << "Run tests on all supported devices \n" << std::endl; std::cout << "Run tests on all supported devices \n" << std::endl;
} }
else else
{ {
DeviceManager::instance().load(device); DeviceManager::instance().load(device);
DeviceInfo info(device); DeviceInfo info(device);
std::cout << "Run tests on device " << device << " [" << info.name() << "] \n" << std::endl; std::cout << "Run tests on device " << device << " [" << info.name() << "] \n" << std::endl;
} }
TS::ptr()->init("cv"); TS::ptr()->init("cv");
InitGoogleTest(&argc, argv); InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
} }
catch (const std::exception& e) catch (const std::exception& e)
{ {
std::cerr << e.what() << std::endl; std::cerr << e.what() << std::endl;
return -1; return -1;
} }
catch (...) catch (...)
{ {
std::cerr << "Unknown error" << std::endl; std::cerr << "Unknown error" << std::endl;
return -1; return -1;
} }


@ -9,16 +9,16 @@
#ifndef __OPENCV_TEST_PRECOMP_HPP__ #ifndef __OPENCV_TEST_PRECOMP_HPP__
#define __OPENCV_TEST_PRECOMP_HPP__ #define __OPENCV_TEST_PRECOMP_HPP__
#include <iostream>
#include "cvconfig.h"
#include "opencv2/opencv_modules.hpp"
#include "opencv2/ts/ts.hpp" #include "opencv2/ts/ts.hpp"
#include "opencv2/imgproc/imgproc.hpp" #include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp" #include "opencv2/highgui/highgui.hpp"
#include "opencv2/nonfree/nonfree.hpp" #include "opencv2/nonfree/nonfree.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_OCL
# include "opencv2/nonfree/ocl.hpp"
#endif
#if defined(HAVE_OPENCV_GPU) && defined(HAVE_CUDA) #if defined(HAVE_OPENCV_GPU) && defined(HAVE_CUDA)
#include "opencv2/ts/gpu_test.hpp" #include "opencv2/ts/gpu_test.hpp"
#include "opencv2/nonfree/gpu.hpp" #include "opencv2/nonfree/gpu.hpp"


@ -43,20 +43,19 @@
// //
//M*/ //M*/
#include "test_precomp.hpp"
#include "precomp.hpp" #ifdef HAVE_OPENCV_OCL
#ifdef HAVE_OPENCL
extern std::string workdir;
using namespace std; using namespace std;
using std::tr1::get;
static bool keyPointsEquals(const cv::KeyPoint& p1, const cv::KeyPoint& p2) static bool keyPointsEquals(const cv::KeyPoint& p1, const cv::KeyPoint& p2)
{ {
const double maxPtDif = 1.0; const double maxPtDif = 0.1;
const double maxSizeDif = 1.0; const double maxSizeDif = 0.1;
const double maxAngleDif = 2.0; const double maxAngleDif = 0.1;
const double maxResponseDif = 0.1; const double maxResponseDif = 0.01;
double dist = cv::norm(p1.pt - p2.pt); double dist = cv::norm(p1.pt - p2.pt);
@ -73,22 +72,10 @@ static bool keyPointsEquals(const cv::KeyPoint& p1, const cv::KeyPoint& p2)
return false; return false;
} }
struct KeyPointLess : std::binary_function<cv::KeyPoint, cv::KeyPoint, bool>
{
bool operator()(const cv::KeyPoint& kp1, const cv::KeyPoint& kp2) const
{
return kp1.pt.y < kp2.pt.y || (kp1.pt.y == kp2.pt.y && kp1.pt.x < kp2.pt.x);
}
};
#define ASSERT_KEYPOINTS_EQ(gold, actual) EXPECT_PRED_FORMAT2(assertKeyPointsEquals, gold, actual);
static int getMatchedPointsCount(std::vector<cv::KeyPoint>& gold, std::vector<cv::KeyPoint>& actual) static int getMatchedPointsCount(std::vector<cv::KeyPoint>& gold, std::vector<cv::KeyPoint>& actual)
{ {
std::sort(actual.begin(), actual.end(), KeyPointLess()); std::sort(actual.begin(), actual.end(), perf::comparators::KeypointGreater());
std::sort(gold.begin(), gold.end(), KeyPointLess()); std::sort(gold.begin(), gold.end(), perf::comparators::KeypointGreater());
int validCount = 0; int validCount = 0;
@ -122,13 +109,24 @@ static int getMatchedPointsCount(const std::vector<cv::KeyPoint>& keypoints1, co
return validCount; return validCount;
} }
IMPLEMENT_PARAM_CLASS(SURF_HessianThreshold, double) #define PARAM_TEST_CASE(name, ...) struct name : testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
IMPLEMENT_PARAM_CLASS(SURF_Octaves, int) #define IMPLEMENT_PARAM_CLASS(name, type) \
IMPLEMENT_PARAM_CLASS(SURF_OctaveLayers, int) namespace { class name { \
IMPLEMENT_PARAM_CLASS(SURF_Extended, bool) public: \
IMPLEMENT_PARAM_CLASS(SURF_Upright, bool) name ( type arg = type ()) : val_(arg) {} \
operator type () const {return val_;} \
private: \
type val_; \
}; \
inline void PrintTo( name param, std::ostream* os) {*os << #name << "=" << testing::PrintToString(static_cast< type >(param));}}
PARAM_TEST_CASE(SURF, SURF_HessianThreshold, SURF_Octaves, SURF_OctaveLayers, SURF_Extended, SURF_Upright) IMPLEMENT_PARAM_CLASS(HessianThreshold, double)
IMPLEMENT_PARAM_CLASS(Octaves, int)
IMPLEMENT_PARAM_CLASS(OctaveLayers, int)
IMPLEMENT_PARAM_CLASS(Extended, bool)
IMPLEMENT_PARAM_CLASS(Upright, bool)
PARAM_TEST_CASE(SURF, HessianThreshold, Octaves, OctaveLayers, Extended, Upright)
{ {
double hessianThreshold; double hessianThreshold;
int nOctaves; int nOctaves;
@ -138,16 +136,17 @@ PARAM_TEST_CASE(SURF, SURF_HessianThreshold, SURF_Octaves, SURF_OctaveLayers, SU
virtual void SetUp() virtual void SetUp()
{ {
hessianThreshold = GET_PARAM(0); hessianThreshold = get<0>(GetParam());
nOctaves = GET_PARAM(1); nOctaves = get<1>(GetParam());
nOctaveLayers = GET_PARAM(2); nOctaveLayers = get<2>(GetParam());
extended = GET_PARAM(3); extended = get<3>(GetParam());
upright = GET_PARAM(4); upright = get<4>(GetParam());
} }
}; };
TEST_P(SURF, Detector)
TEST_P(SURF, DISABLED_Detector)
{ {
cv::Mat image = readImage(workdir + "fruits.jpg", cv::IMREAD_GRAYSCALE); cv::Mat image = cv::imread(string(cvtest::TS::ptr()->get_data_path()) + "shared/fruits.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty()); ASSERT_FALSE(image.empty());
cv::ocl::SURF_OCL surf; cv::ocl::SURF_OCL surf;
@ -175,12 +174,12 @@ TEST_P(SURF, Detector)
int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints); int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size(); double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();
EXPECT_GT(matchedRatio, 0.95); EXPECT_GT(matchedRatio, 0.99);
} }
TEST_P(SURF, Descriptor) TEST_P(SURF, DISABLED_Descriptor)
{ {
cv::Mat image = readImage(workdir + "fruits.jpg", cv::IMREAD_GRAYSCALE); cv::Mat image = cv::imread(string(cvtest::TS::ptr()->get_data_path()) + "shared/fruits.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty()); ASSERT_FALSE(image.empty());
cv::ocl::SURF_OCL surf; cv::ocl::SURF_OCL surf;
@ -218,10 +217,10 @@ TEST_P(SURF, Descriptor)
} }
INSTANTIATE_TEST_CASE_P(OCL_Features2D, SURF, testing::Combine( INSTANTIATE_TEST_CASE_P(OCL_Features2D, SURF, testing::Combine(
testing::Values(/*SURF_HessianThreshold(100.0), */SURF_HessianThreshold(500.0), SURF_HessianThreshold(1000.0)), testing::Values(HessianThreshold(500.0), HessianThreshold(1000.0)),
testing::Values(SURF_Octaves(3), SURF_Octaves(4)), testing::Values(Octaves(3), Octaves(4)),
testing::Values(SURF_OctaveLayers(2), SURF_OctaveLayers(3)), testing::Values(OctaveLayers(2), OctaveLayers(3)),
testing::Values(SURF_Extended(false), SURF_Extended(true)), testing::Values(Extended(false), Extended(true)),
testing::Values(SURF_Upright(false), SURF_Upright(true)))); testing::Values(Upright(false), Upright(true))));
#endif #endif // HAVE_OPENCV_OCL

View File

@ -1,69 +1,7 @@
# Will be modified later
if(NOT HAVE_OPENCL) if(NOT HAVE_OPENCL)
ocv_module_disable(ocl) ocv_module_disable(ocl)
endif() endif()
set(the_description "OpenCL-accelerated Computer Vision") set(the_description "OpenCL-accelerated Computer Vision")
ocv_add_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_nonfree) ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video)
ocv_module_include_directories()
file(GLOB CL_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/kernels/*.cl")
set(kernels_cpp "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
set(cl2cpp_script "${CMAKE_CURRENT_SOURCE_DIR}/cl2cpp.cmake")
add_custom_command(
OUTPUT ${kernels_cpp}
COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/kernels" -DOUTPUT="${kernels_cpp}" -P ${cl2cpp_script}
DEPENDS ${CL_FILES} ${cl2cpp_script})
file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_int_hdrs "src/*.h*")
source_group("Include" FILES ${lib_hdrs})
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs} ${kernels_cpp})
if (HAVE_OPENCL)
set(ocl_link_libs ${OPENCL_LIBRARIES})
if(OPENCL_INCLUDE_DIR)
ocv_include_directories(${OPENCL_INCLUDE_DIR})
endif()
if (HAVE_CLAMDFFT)
set(ocl_link_libs ${ocl_link_libs} ${CLAMDFFT_LIBRARIES})
ocv_include_directories(${CLAMDFFT_INCLUDE_DIR})
endif()
if (HAVE_CLAMDBLAS)
set(ocl_link_libs ${ocl_link_libs} ${CLAMDBLAS_LIBRARIES})
ocv_include_directories(${CLAMDBLAS_INCLUDE_DIR})
endif()
endif()
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
ocv_set_module_sources(HEADERS ${lib_hdrs} SOURCES ${lib_int_hdrs} ${lib_srcs} ${kernels_cpp})
ocv_create_module(${ocl_link_libs})
install(FILES ${lib_hdrs}
DESTINATION include/opencv2/${name}
COMPONENT main)
ocv_add_precompiled_headers(${the_module})
################################################################################################################
################################ OpenCL Module Tests ##################################################
################################################################################################################
file(GLOB test_srcs "test/*.cpp")
file(GLOB test_hdrs "test/*.hpp" "test/*.h")
ocv_add_accuracy_tests(FILES "Include" ${test_hdrs}
FILES "Src" ${test_srcs})
################################################################################################################
################################ OpenCL Module Performance ##################################################
################################################################################################################
file(GLOB perf_srcs "perf/*.cpp")
file(GLOB perf_hdrs "perf/*.hpp" "perf/*.h")
ocv_add_perf_tests(FILES "Include" ${perf_hdrs}
FILES "Src" ${perf_srcs})


@ -88,102 +88,3 @@ Computes a proximity map for a raster template and an image where the template i
* ``CV_TM_CCORR`` * ``CV_TM_CCORR``
.. seealso:: :ocv:func:`matchTemplate` .. seealso:: :ocv:func:`matchTemplate`
ocl::SURF_OCL
-------------
.. ocv:class:: ocl::SURF_OCL
Class used for extracting Speeded Up Robust Features (SURF) from an image. ::
class SURF_OCL
{
public:
enum KeypointLayout
{
X_ROW = 0,
Y_ROW,
LAPLACIAN_ROW,
OCTAVE_ROW,
SIZE_ROW,
ANGLE_ROW,
HESSIAN_ROW,
ROWS_COUNT
};
//! the default constructor
SURF_OCL();
//! the full constructor taking all the necessary parameters
explicit SURF_OCL(double _hessianThreshold, int _nOctaves=4,
int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);
//! returns the descriptor size in float's (64 or 128)
int descriptorSize() const;
//! upload host keypoints to device memory
void uploadKeypoints(const vector<KeyPoint>& keypoints,
oclMat& keypointsocl);
//! download keypoints from device to host memory
void downloadKeypoints(const oclMat& keypointsocl,
vector<KeyPoint>& keypoints);
//! download descriptors from device to host memory
void downloadDescriptors(const oclMat& descriptorsocl,
vector<float>& descriptors);
void operator()(const oclMat& img, const oclMat& mask,
oclMat& keypoints);
void operator()(const oclMat& img, const oclMat& mask,
oclMat& keypoints, oclMat& descriptors,
bool useProvidedKeypoints = false);
void operator()(const oclMat& img, const oclMat& mask,
std::vector<KeyPoint>& keypoints);
void operator()(const oclMat& img, const oclMat& mask,
std::vector<KeyPoint>& keypoints, oclMat& descriptors,
bool useProvidedKeypoints = false);
void operator()(const oclMat& img, const oclMat& mask,
std::vector<KeyPoint>& keypoints,
std::vector<float>& descriptors,
bool useProvidedKeypoints = false);
void releaseMemory();
// SURF parameters
double hessianThreshold;
int nOctaves;
int nOctaveLayers;
bool extended;
bool upright;
//! max keypoints = min(keypointsRatio * img.size().area(), 65535)
float keypointsRatio;
oclMat sum, mask1, maskSum, intBuffer;
oclMat det, trace;
oclMat maxPosBuffer;
};
The class ``SURF_OCL`` implements Speeded Up Robust Features descriptor. There is a fast multi-scale Hessian keypoint detector that can be used to find the keypoints (which is the default option). But the descriptors can also be computed for the user-specified keypoints. Only 8-bit grayscale images are supported.
The class ``SURF_OCL`` can store results in the GPU and CPU memory. It provides functions to convert results between CPU and GPU version ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``oclMat``. The ``keypoints`` matrix is :math:`\texttt{nFeatures} \times 7` matrix with the ``CV_32FC1`` type.
* ``keypoints.ptr<float>(X_ROW)[i]`` contains x coordinate of the i-th feature.
* ``keypoints.ptr<float>(Y_ROW)[i]`` contains y coordinate of the i-th feature.
* ``keypoints.ptr<float>(LAPLACIAN_ROW)[i]`` contains the laplacian sign of the i-th feature.
* ``keypoints.ptr<float>(OCTAVE_ROW)[i]`` contains the octave of the i-th feature.
* ``keypoints.ptr<float>(SIZE_ROW)[i]`` contains the size of the i-th feature.
* ``keypoints.ptr<float>(ANGLE_ROW)[i]`` contain orientation of the i-th feature.
* ``keypoints.ptr<float>(HESSIAN_ROW)[i]`` contains the response of the i-th feature.
The ``descriptors`` matrix is :math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type.
The class ``SURF_OCL`` uses some buffers and provides access to it. All buffers can be safely released between function calls.
.. seealso:: :ocv:class:`SURF`
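For reference, a minimal usage sketch of the interface documented above. After this commit the class is reached through the nonfree module header (as in the test precomp above); the image file name, the empty mask, and building an oclMat directly from a Mat are illustrative assumptions, not taken from this diff.

    #include <vector>
    #include "opencv2/highgui/highgui.hpp"
    #include "opencv2/nonfree/ocl.hpp"   // SURF_OCL lives here after this commit

    int main()
    {
        // OpenCL device/context selection is omitted for brevity.
        // Only 8-bit grayscale images are supported.
        cv::Mat img = cv::imread("fruits.png", cv::IMREAD_GRAYSCALE);  // hypothetical input
        if (img.empty())
            return -1;

        cv::ocl::oclMat d_img(img);   // assumes oclMat can be constructed from (uploaded from) a Mat
        cv::ocl::oclMat d_mask;       // empty mask, assumed to mean "use the whole image"

        cv::ocl::SURF_OCL surf(500.0 /* hessianThreshold */);

        std::vector<cv::KeyPoint> keypoints;
        std::vector<float> descriptors;              // descriptorSize() floats (64 or 128) per keypoint
        surf(d_img, d_mask, keypoints, descriptors); // overload from the listing above

        return keypoints.empty() ? 1 : 0;
    }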


@ -69,28 +69,28 @@ namespace cv
enum DevMemRW enum DevMemRW
{ {
DEVICE_MEM_R_W = 0, DEVICE_MEM_R_W = 0,
DEVICE_MEM_R_ONLY, DEVICE_MEM_R_ONLY,
DEVICE_MEM_W_ONLY DEVICE_MEM_W_ONLY
}; };
enum DevMemType enum DevMemType
{ {
DEVICE_MEM_DEFAULT = 0, DEVICE_MEM_DEFAULT = 0,
DEVICE_MEM_AHP, //alloc host pointer DEVICE_MEM_AHP, //alloc host pointer
DEVICE_MEM_UHP, //use host pointer DEVICE_MEM_UHP, //use host pointer
DEVICE_MEM_CHP, //copy host pointer DEVICE_MEM_CHP, //copy host pointer
DEVICE_MEM_PM //persistent memory DEVICE_MEM_PM //persistent memory
}; };
//Get the global device memory and read/write type //Get the global device memory and read/write type
//return 1 if unified memory system supported, otherwise return 0 //return 1 if unified memory system supported, otherwise return 0
CV_EXPORTS int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type); CV_EXPORTS int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type);
//Set the global device memory and read/write type, //Set the global device memory and read/write type,
//the newly generated oclMat will all use this type //the newly generated oclMat will all use this type
//return -1 if the target type is unsupported, otherwise return 0 //return -1 if the target type is unsupported, otherwise return 0
CV_EXPORTS int setDevMemType(DevMemRW rw_type = DEVICE_MEM_R_W, DevMemType mem_type = DEVICE_MEM_DEFAULT); CV_EXPORTS int setDevMemType(DevMemRW rw_type = DEVICE_MEM_R_W, DevMemType mem_type = DEVICE_MEM_DEFAULT);
//this class contains ocl runtime information //this class contains ocl runtime information
class CV_EXPORTS Info class CV_EXPORTS Info
@ -135,20 +135,28 @@ namespace cv
//////////////////////////////// OpenCL context //////////////////////// //////////////////////////////// OpenCL context ////////////////////////
//This is a global singleton class used to represent a OpenCL context. //This is a global singleton class used to represent a OpenCL context.
class Context class CV_EXPORTS Context
{ {
protected: protected:
Context(); Context();
friend class auto_ptr<Context>; friend class auto_ptr<Context>;
static auto_ptr<Context> clCxt;
private:
static auto_ptr<Context> clCxt;
static int val;
public: public:
~Context(); ~Context();
static int val; void release();
static Context *getContext(); Info::Impl* impl;
static Context* getContext();
static void setContext(Info &oclinfo); static void setContext(Info &oclinfo);
struct Impl;
Impl *impl; enum {CL_DOUBLE, CL_UNIFIED_MEM};
bool supportsFeature(int ftype);
size_t computeUnits();
void* oclContext();
void* oclCommandQueue();
}; };
//! Calls a kernel, by string. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing. //! Calls a kernel, by string. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing.
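The refactored Context interface above hides the old impl fields behind supportsFeature(), computeUnits(), oclContext() and oclCommandQueue(); the remaining hunks in this commit rewrite the callers accordingly. A minimal sketch of the guard pattern they switch to (the helper name is hypothetical):

    #include "opencv2/ocl/ocl.hpp"

    // Hypothetical helper mirroring the checks that replace "impl->double_support" below.
    static void checkDoubleSupport(const cv::ocl::oclMat &src)
    {
        cv::ocl::Context *clCxt = src.clCxt;

        // bail out on CV_64F data if the device lacks double-precision support
        if (!clCxt->supportsFeature(cv::ocl::Context::CL_DOUBLE) && src.depth() == CV_64F)
            CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");

        size_t groups = clCxt->computeUnits();    // was impl->maxComputeUnits
        void *ctx     = clCxt->oclContext();      // raw cl_context handle
        void *queue   = clCxt->oclCommandQueue(); // raw cl_command_queue handle
        (void)groups; (void)ctx; (void)queue;     // silence unused-variable warnings
    }

Returning the handles as void* appears to be what keeps the OpenCL headers out of the public ocl.hpp; callers that need the typed handles cast them back, as the canny, fft and gemm hunks below do.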
@ -1073,156 +1081,6 @@ namespace cv
}; };
//! Speeded up robust features, port from GPU module.
////////////////////////////////// SURF //////////////////////////////////////////
class CV_EXPORTS SURF_OCL
{
public:
enum KeypointLayout
{
X_ROW = 0,
Y_ROW,
LAPLACIAN_ROW,
OCTAVE_ROW,
SIZE_ROW,
ANGLE_ROW,
HESSIAN_ROW,
ROWS_COUNT
};
//! the default constructor
SURF_OCL();
//! the full constructor taking all the necessary parameters
explicit SURF_OCL(double _hessianThreshold, int _nOctaves = 4,
int _nOctaveLayers = 2, bool _extended = false, float _keypointsRatio = 0.01f, bool _upright = false);
//! returns the descriptor size in float's (64 or 128)
int descriptorSize() const;
//! upload host keypoints to device memory
void uploadKeypoints(const vector<cv::KeyPoint> &keypoints, oclMat &keypointsocl);
//! download keypoints from device to host memory
void downloadKeypoints(const oclMat &keypointsocl, vector<KeyPoint> &keypoints);
//! download descriptors from device to host memory
void downloadDescriptors(const oclMat &descriptorsocl, vector<float> &descriptors);
//! finds the keypoints using fast hessian detector used in SURF
//! supports CV_8UC1 images
//! keypoints will have nFeature cols and 6 rows
//! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
//! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
//! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
//! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
//! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
//! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
//! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints);
//! finds the keypoints and computes their descriptors.
//! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors,
bool useProvidedKeypoints = false);
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints);
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, oclMat &descriptors,
bool useProvidedKeypoints = false);
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, std::vector<float> &descriptors,
bool useProvidedKeypoints = false);
void releaseMemory();
// SURF parameters
float hessianThreshold;
int nOctaves;
int nOctaveLayers;
bool extended;
bool upright;
//! max keypoints = min(keypointsRatio * img.size().area(), 65535)
float keypointsRatio;
oclMat sum, mask1, maskSum, intBuffer;
oclMat det, trace;
oclMat maxPosBuffer;
};
////////////////////////feature2d_ocl///////////////// ////////////////////////feature2d_ocl/////////////////
/****************************************************************************************\ /****************************************************************************************\
* Distance * * Distance *


@ -0,0 +1,130 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_OCL_PRIVATE_UTIL__
#define __OPENCV_OCL_PRIVATE_UTIL__
#include "opencv2/ocl/ocl.hpp"
#if defined __APPLE__
#include <OpenCL/OpenCL.h>
#else
#include <CL/opencl.h>
#endif
namespace cv
{
namespace ocl
{
enum openCLMemcpyKind
{
clMemcpyHostToDevice = 0,
clMemcpyDeviceToHost,
clMemcpyDeviceToDevice
};
///////////////////////////OpenCL call wrappers////////////////////////////
void CV_EXPORTS openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
size_t widthInBytes, size_t height);
void CV_EXPORTS openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type);
void CV_EXPORTS openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
const void *src, size_t spitch,
size_t width, size_t height, openCLMemcpyKind kind, int channels = -1);
void CV_EXPORTS openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
const void *src, size_t spitch,
size_t width, size_t height, int src_offset);
void CV_EXPORTS openCLFree(void *devPtr);
cl_mem CV_EXPORTS openCLCreateBuffer(Context *clCxt, size_t flag, size_t size);
void CV_EXPORTS openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size);
cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt,
const char **source, std::string kernelName);
cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt,
const char **source, std::string kernelName, const char *build_options);
void CV_EXPORTS openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, std::vector< std::pair<size_t, const void *> > &args,
int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
void CV_EXPORTS openCLExecuteKernel_(Context *clCxt , const char **source, std::string kernelName,
size_t globalThreads[3], size_t localThreads[3],
std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options);
void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, int depth);
void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels,
int depth, const char *build_options);
cl_mem CV_EXPORTS load_constant(cl_context context, cl_command_queue command_queue, const void *value,
const size_t size);
cl_mem CV_EXPORTS openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr);
int CV_EXPORTS savetofile(const Context *clcxt, cl_program &program, const char *fileName);
enum FLUSH_MODE
{
CLFINISH = 0,
CLFLUSH,
DISABLE
};
void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE);
void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels,
int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE);
// bind oclMat to OpenCL image textures
// note:
// 1. there is no memory management. User need to explicitly release the resource
// 2. for faster clamping, there is no buffer padding for the constructed texture
cl_mem CV_EXPORTS bindTexture(const oclMat &mat);
void CV_EXPORTS releaseTexture(cl_mem& texture);
// returns whether the current context supports image2d_t format or not
bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext());
}//namespace ocl
}//namespace cv
#endif //__OPENCV_OCL_PRIVATE_UTIL__
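The wrappers declared above share a simple calling convention, visible throughout the hunks that follow: kernel arguments are collected as (size, pointer) pairs and handed to openCLExecuteKernel together with the kernel-source string table, the kernel name and the launch geometry. A rough sketch under that assumption; the include path, source table and kernel name are hypothetical:

    #include <string>
    #include <utility>
    #include <vector>
    #include "opencv2/ocl/private/util.hpp"  // assumed install path of the header above

    // Hypothetical launcher showing the (size, pointer) argument convention.
    static void launchKernel(const cv::ocl::oclMat &src, cv::ocl::oclMat &dst,
                             const char **programSource)   // table holding the .cl source text
    {
        cv::ocl::Context *clCxt = src.clCxt;

        std::vector< std::pair<size_t, const void *> > args;
        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));
        args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows));
        args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols));

        // launch geometry; real callers round the global sizes up to a
        // multiple of the local sizes (cf. divUp in the arithmetic code below)
        size_t localThreads[3]  = {16, 16, 1};
        size_t globalThreads[3] = {(size_t)src.cols, (size_t)src.rows, 1};

        // channels = -1, depth = -1: use the kernel name as-is,
        // matching the calls seen throughout this commit
        cv::ocl::openCLExecuteKernel(clCxt, programSource, "my_kernel",
                                     globalThreads, localThreads, args, -1, -1);
    }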


@ -132,7 +132,7 @@ inline int divUp(int total, int grain)
template<typename T> template<typename T>
void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString, void *_scalar) void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString, void *_scalar)
{ {
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return; return;
@ -195,7 +195,7 @@ static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
} }
static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString) static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString)
{ {
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return; return;
@ -272,7 +272,7 @@ typedef void (*MulDivFunc)(const oclMat &src1, const oclMat &src2, oclMat &dst,
void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar) void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
{ {
if((src1.clCxt -> impl -> double_support != 0) && (src1.depth() == CV_64F)) if(src1.clCxt->supportsFeature(Context::CL_DOUBLE) && (src1.depth() == CV_64F))
arithmetic_run<double>(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar)); arithmetic_run<double>(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar));
else else
arithmetic_run<float>(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar)); arithmetic_run<float>(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar));
@ -280,7 +280,7 @@ void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, doub
void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar) void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
{ {
if(src1.clCxt -> impl -> double_support != 0) if(src1.clCxt->supportsFeature(Context::CL_DOUBLE))
arithmetic_run<double>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar)); arithmetic_run<double>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
else else
arithmetic_run<float>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar)); arithmetic_run<float>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
@ -289,7 +289,7 @@ void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double
template <typename WT , typename CL_WT> template <typename WT , typename CL_WT>
void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar) void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar)
{ {
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return; return;
@ -361,7 +361,7 @@ void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst,
static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelName, const char **kernelString, double scalar) static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelName, const char **kernelString, double scalar)
{ {
if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return; return;
@ -405,7 +405,7 @@ static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelN
args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
if(src.clCxt -> impl -> double_support != 0) if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
args.push_back( make_pair( sizeof(cl_double), (void *)&scalar )); args.push_back( make_pair( sizeof(cl_double), (void *)&scalar ));
else else
{ {
@ -464,7 +464,7 @@ void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, cons
} }
void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst) void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst)
{ {
if(src.clCxt -> impl -> double_support == 0) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
{ {
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return; return;
@ -524,7 +524,7 @@ static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, str
void cv::ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst , int cmpOp) void cv::ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst , int cmpOp)
{ {
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{ {
cout << "Selected device do not support double" << endl; cout << "Selected device do not support double" << endl;
return; return;
@ -599,7 +599,7 @@ static void arithmetic_sum_buffer_run(const oclMat &src, cl_mem &dst, int vlen ,
template <typename T> template <typename T>
Scalar arithmetic_sum(const oclMat &src, int type = 0) Scalar arithmetic_sum(const oclMat &src, int type = 0)
{ {
size_t groupnum = src.clCxt->impl->maxComputeUnits; size_t groupnum = src.clCxt->computeUnits();
CV_Assert(groupnum != 0); CV_Assert(groupnum != 0);
int vlen = src.oclchannels() == 3 ? 12 : 8, dbsize = groupnum * vlen; int vlen = src.oclchannels() == 3 ? 12 : 8, dbsize = groupnum * vlen;
Context *clCxt = src.clCxt; Context *clCxt = src.clCxt;
@ -627,7 +627,7 @@ Scalar arithmetic_sum(const oclMat &src, int type = 0)
typedef Scalar (*sumFunc)(const oclMat &src, int type); typedef Scalar (*sumFunc)(const oclMat &src, int type);
Scalar cv::ocl::sum(const oclMat &src) Scalar cv::ocl::sum(const oclMat &src)
{ {
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "select device don't support double"); CV_Error(CV_GpuNotSupported, "select device don't support double");
} }
@ -638,13 +638,13 @@ Scalar cv::ocl::sum(const oclMat &src)
}; };
sumFunc func; sumFunc func;
func = functab[src.clCxt->impl->double_support]; func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)];
return func(src, 0); return func(src, 0);
} }
Scalar cv::ocl::absSum(const oclMat &src) Scalar cv::ocl::absSum(const oclMat &src)
{ {
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "select device don't support double"); CV_Error(CV_GpuNotSupported, "select device don't support double");
} }
@ -655,13 +655,13 @@ Scalar cv::ocl::absSum(const oclMat &src)
}; };
sumFunc func; sumFunc func;
func = functab[src.clCxt->impl->double_support]; func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)];
return func(src, 1); return func(src, 1);
} }
Scalar cv::ocl::sqrSum(const oclMat &src) Scalar cv::ocl::sqrSum(const oclMat &src)
{ {
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "select device don't support double"); CV_Error(CV_GpuNotSupported, "select device don't support double");
} }
@ -672,7 +672,7 @@ Scalar cv::ocl::sqrSum(const oclMat &src)
}; };
sumFunc func; sumFunc func;
func = functab[src.clCxt->impl->double_support]; func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)];
return func(src, 2); return func(src, 2);
} }
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
@ -771,7 +771,7 @@ static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl
template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask) template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
{ {
size_t groupnum = src.clCxt->impl->maxComputeUnits; size_t groupnum = src.clCxt->computeUnits();
CV_Assert(groupnum != 0); CV_Assert(groupnum != 0);
groupnum = groupnum * 2; groupnum = groupnum * 2;
int vlen = 8; int vlen = 8;
@ -810,7 +810,7 @@ typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, co
void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask) void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
{ {
CV_Assert(src.oclchannels() == 1); CV_Assert(src.oclchannels() == 1);
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "select device don't support double"); CV_Error(CV_GpuNotSupported, "select device don't support double");
} }
@ -894,7 +894,7 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kernelName) static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kernelName)
{ {
if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return; return;
@ -943,7 +943,7 @@ static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kern
} }
static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kernelName, bool isVertical) static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kernelName, bool isVertical)
{ {
if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return; return;
@ -1123,7 +1123,7 @@ static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernel
CV_Assert( src.type() == CV_32F || src.type() == CV_64F); CV_Assert( src.type() == CV_32F || src.type() == CV_64F);
Context *clCxt = src.clCxt; Context *clCxt = src.clCxt;
if(clCxt -> impl -> double_support == 0 && src.type() == CV_64F) if(!clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return; return;
@ -1164,7 +1164,7 @@ void cv::ocl::log(const oclMat &src, oclMat &dst)
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName) static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName)
{ {
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return; return;
@ -1212,7 +1212,7 @@ void cv::ocl::magnitude(const oclMat &src1, const oclMat &src2, oclMat &dst)
static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString) static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString)
{ {
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return; return;
@ -1276,7 +1276,7 @@ void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle , bool angle
static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, oclMat &dst_mag, oclMat &dst_cart, static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, oclMat &dst_mag, oclMat &dst_cart,
string kernelName, bool angleInDegrees) string kernelName, bool angleInDegrees)
{ {
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return; return;
@ -1331,7 +1331,7 @@ void cv::ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &mag, oclMat
static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &dst1, oclMat &dst2, bool angleInDegrees, static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &dst1, oclMat &dst2, bool angleInDegrees,
string kernelName) string kernelName)
{ {
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return; return;
@ -1452,7 +1452,7 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
Point *minLoc, Point *maxLoc, const oclMat &mask) Point *minLoc, Point *maxLoc, const oclMat &mask)
{ {
CV_Assert(src.oclchannels() == 1); CV_Assert(src.oclchannels() == 1);
size_t groupnum = src.clCxt->impl->maxComputeUnits; size_t groupnum = src.clCxt->computeUnits();
CV_Assert(groupnum != 0); CV_Assert(groupnum != 0);
int minloc = -1 , maxloc = -1; int minloc = -1 , maxloc = -1;
int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) ; int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) ;
@ -1513,7 +1513,7 @@ typedef void (*minMaxLocFunc)(const oclMat &src, double *minVal, double *maxVal,
void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal, void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
Point *minLoc, Point *maxLoc, const oclMat &mask) Point *minLoc, Point *maxLoc, const oclMat &mask)
{ {
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "select device don't support double"); CV_Error(CV_GpuNotSupported, "select device don't support double");
} }
@ -1524,7 +1524,7 @@ void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
}; };
minMaxLocFunc func; minMaxLocFunc func;
func = functab[src.clCxt->impl->double_support]; func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)];
func(src, minVal, maxVal, minLoc, maxLoc, mask); func(src, minVal, maxVal, minLoc, maxLoc, mask);
} }
@ -1559,8 +1559,8 @@ static void arithmetic_countNonZero_run(const oclMat &src, cl_mem &dst, int vlen
int cv::ocl::countNonZero(const oclMat &src) int cv::ocl::countNonZero(const oclMat &src)
{ {
size_t groupnum = src.clCxt->impl->maxComputeUnits; size_t groupnum = src.clCxt->computeUnits();
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "select device don't support double"); CV_Error(CV_GpuNotSupported, "select device don't support double");
} }
@ -1845,7 +1845,7 @@ static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst,
void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst) void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst)
{ {
if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
{ {
cout << "Selected device do not support double" << endl; cout << "Selected device do not support double" << endl;
return; return;
@ -1858,7 +1858,7 @@ void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst)
void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
{ {
// dst.create(src1.size(),src1.type()); // dst.create(src1.size(),src1.type());
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{ {
cout << "Selected device do not support double" << endl; cout << "Selected device do not support double" << endl;
return; return;
@ -1874,7 +1874,7 @@ void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, co
void cv::ocl::bitwise_or(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) void cv::ocl::bitwise_or(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
{ {
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{ {
cout << "Selected device do not support double" << endl; cout << "Selected device do not support double" << endl;
return; return;
@ -1889,7 +1889,7 @@ void cv::ocl::bitwise_or(const oclMat &src1, const Scalar &src2, oclMat &dst, co
void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
{ {
// dst.create(src1.size(),src1.type()); // dst.create(src1.size(),src1.type());
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{ {
cout << "Selected device do not support double" << endl; cout << "Selected device do not support double" << endl;
return; return;
@ -1906,7 +1906,7 @@ void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, c
void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
{ {
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{ {
cout << "Selected device do not support double" << endl; cout << "Selected device do not support double" << endl;
return; return;
@ -1920,7 +1920,7 @@ void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, c
void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
{ {
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{ {
cout << "Selected device do not support double" << endl; cout << "Selected device do not support double" << endl;
return; return;
@ -1939,7 +1939,7 @@ void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, c
void cv::ocl::bitwise_xor(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) void cv::ocl::bitwise_xor(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
{ {
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{ {
cout << "Selected device do not support double" << endl; cout << "Selected device do not support double" << endl;
return; return;
@ -2036,7 +2036,7 @@ oclMatExpr::operator oclMat() const
#define BLOCK_ROWS (256/TILE_DIM) #define BLOCK_ROWS (256/TILE_DIM)
static void transpose_run(const oclMat &src, oclMat &dst, string kernelName) static void transpose_run(const oclMat &src, oclMat &dst, string kernelName)
{ {
if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return; return;
@ -2135,7 +2135,7 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2,
args.push_back( make_pair( sizeof(cl_int), (void *)&src2_step )); args.push_back( make_pair( sizeof(cl_int), (void *)&src2_step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset)); args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset));
if(src1.clCxt -> impl -> double_support != 0) if(src1.clCxt->supportsFeature(Context::CL_DOUBLE))
{ {
args.push_back( make_pair( sizeof(cl_double), (void *)&alpha )); args.push_back( make_pair( sizeof(cl_double), (void *)&alpha ));
args.push_back( make_pair( sizeof(cl_double), (void *)&beta )); args.push_back( make_pair( sizeof(cl_double), (void *)&beta ));
@ -2282,7 +2282,7 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
if(src1.clCxt -> impl -> double_support == 0) if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE))
{ {
float pf = p; float pf = p;
args.push_back( make_pair( sizeof(cl_float), (void *)&pf )); args.push_back( make_pair( sizeof(cl_float), (void *)&pf ));
@ -2294,7 +2294,7 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string
} }
void cv::ocl::pow(const oclMat &x, double p, oclMat &y) void cv::ocl::pow(const oclMat &x, double p, oclMat &y)
{ {
if(x.clCxt -> impl -> double_support == 0 && x.type() == CV_64F) if(!x.clCxt->supportsFeature(Context::CL_DOUBLE) && x.type() == CV_64F)
{ {
cout << "Selected device do not support double" << endl; cout << "Selected device do not support double" << endl;
return; return;


@ -43,9 +43,7 @@
// //
//M*/ //M*/
#include <iomanip>
#include "precomp.hpp" #include "precomp.hpp"
#include "mcwutil.hpp"
using namespace cv; using namespace cv;
using namespace cv::ocl; using namespace cv::ocl;
@ -100,7 +98,7 @@ void cv::ocl::CannyBuf::create(const Size &image_size, int apperture_size)
{ {
openCLFree(counter); openCLFree(counter);
} }
counter = clCreateBuffer( Context::getContext()->impl->clContext, CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err ); counter = clCreateBuffer( (cl_context)getoclContext(), CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err );
openCLSafeCall(err); openCLSafeCall(err);
} }
@ -356,7 +354,7 @@ void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, in
void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols) void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols)
{ {
unsigned int count; unsigned int count;
openCLSafeCall(clEnqueueReadBuffer(Context::getContext()->impl->clCmdQueue, (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL)); openCLSafeCall(clEnqueueReadBuffer((cl_command_queue)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL));
Context *clCxt = map.clCxt; Context *clCxt = map.clCxt;
string kernelName = "edgesHysteresisGlobal"; string kernelName = "edgesHysteresisGlobal";
vector< pair<size_t, const void *> > args; vector< pair<size_t, const void *> > args;
@ -366,7 +364,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
int count_i[1] = {0}; int count_i[1] = {0};
while(count > 0) while(count > 0)
{ {
openCLSafeCall(clEnqueueWriteBuffer(Context::getContext()->impl->clCmdQueue, (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL)); openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL));
args.clear(); args.clear();
size_t globalThreads[3] = {std::min(count, 65535u) * 128, DIVUP(count, 65535), 1}; size_t globalThreads[3] = {std::min(count, 65535u) * 128, DIVUP(count, 65535), 1};
@ -381,7 +379,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset)); args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, DISABLE); openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, DISABLE);
openCLSafeCall(clEnqueueReadBuffer(Context::getContext()->impl->clCmdQueue, (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL)); openCLSafeCall(clEnqueueReadBuffer((cl_command_queue)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
std::swap(st1, st2); std::swap(st1, st2);
} }
#undef DIVUP #undef DIVUP


@ -206,7 +206,7 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
clStridesIn[2] = is_row_dft ? clStridesIn[1] : dft_size.width * clStridesIn[1]; clStridesIn[2] = is_row_dft ? clStridesIn[1] : dft_size.width * clStridesIn[1];
clStridesOut[2] = is_row_dft ? clStridesOut[1] : dft_size.width * clStridesOut[1]; clStridesOut[2] = is_row_dft ? clStridesOut[1] : dft_size.width * clStridesOut[1];
openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, Context::getContext()->impl->clContext, dim, clLengthsIn ) ); openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, (cl_context)getoclContext(), dim, clLengthsIn ) );
openCLSafeCall( clAmdFftSetResultLocation( plHandle, CLFFT_OUTOFPLACE ) ); openCLSafeCall( clAmdFftSetResultLocation( plHandle, CLFFT_OUTOFPLACE ) );
openCLSafeCall( clAmdFftSetLayout( plHandle, inLayout, outLayout ) ); openCLSafeCall( clAmdFftSetLayout( plHandle, inLayout, outLayout ) );
@ -220,7 +220,8 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
openCLSafeCall( clAmdFftSetPlanScale ( plHandle, is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale_ ) ); openCLSafeCall( clAmdFftSetPlanScale ( plHandle, is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale_ ) );
//ready to bake //ready to bake
openCLSafeCall( clAmdFftBakePlan( plHandle, 1, &(Context::getContext()->impl->clCmdQueue), NULL, NULL ) ); cl_command_queue clq = (cl_command_queue)getoclCommandQueue();
openCLSafeCall( clAmdFftBakePlan( plHandle, 1, &clq, NULL, NULL ) );
} }
cv::ocl::FftPlan::~FftPlan() cv::ocl::FftPlan::~FftPlan()
{ {
@ -338,16 +339,17 @@ void cv::ocl::dft(const oclMat &src, oclMat &dst, Size dft_size, int flags)
if (buffersize) if (buffersize)
{ {
cl_int medstatus; cl_int medstatus;
clMedBuffer = clCreateBuffer ( src.clCxt->impl->clContext, CL_MEM_READ_WRITE, buffersize, 0, &medstatus); clMedBuffer = clCreateBuffer ( (cl_context)src.clCxt->oclContext(), CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
openCLSafeCall( medstatus ); openCLSafeCall( medstatus );
} }
cl_command_queue clq = (cl_command_queue)src.clCxt->oclCommandQueue();
openCLSafeCall( clAmdFftEnqueueTransform( plHandle, openCLSafeCall( clAmdFftEnqueueTransform( plHandle,
is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD,
1, 1,
&src.clCxt->impl->clCmdQueue, &clq,
0, NULL, NULL, 0, NULL, NULL,
(cl_mem *)&src.data, (cl_mem *)&dst.data, clMedBuffer ) ); (cl_mem *)&src.data, (cl_mem *)&dst.data, clMedBuffer ) );
openCLSafeCall( clFinish(src.clCxt->impl->clCmdQueue) ); openCLSafeCall( clFinish(clq) );
if(clMedBuffer) if(clMedBuffer)
{ {
openCLFree(clMedBuffer); openCLFree(clMedBuffer);
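Why the new local variable: clAmdFftBakePlan and clAmdFftEnqueueTransform take a cl_command_queue*, and the address of a temporary produced by a cast cannot be taken, so the casted handle has to live in a named lvalue first. The resulting pattern, mirroring the lines above:

cl_command_queue clq = (cl_command_queue)src.clCxt->oclCommandQueue(); // named lvalue so &clq is valid
// &clq can now be passed to clAmdFftBakePlan / clAmdFftEnqueueTransform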

View File

@ -48,8 +48,7 @@
//M*/ //M*/
#include "precomp.hpp" #include "precomp.hpp"
#include "mcwutil.hpp"
#include <iostream>
using namespace std; using namespace std;
using namespace cv; using namespace cv;
using namespace cv::ocl; using namespace cv::ocl;
@ -1479,7 +1478,7 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale) void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale)
{ {
if (src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F) if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return; return;
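The double-support check now goes through the public feature query instead of the impl field. A minimal hedged sketch of the guard pattern used throughout the module (the helper name is illustrative):

#include "opencv2/ocl/ocl.hpp"   // assumed public header

// Reject CV_64F inputs on devices without the cl_khr_fp64 extension.
static bool doubleOk(const cv::ocl::oclMat &m)
{
    return m.depth() != CV_64F || m.clCxt->supportsFeature(cv::ocl::Context::CL_DOUBLE);
}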

View File

@ -87,7 +87,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
int offb = src2.offset; int offb = src2.offset;
int offc = dst.offset; int offc = dst.offset;
cl_command_queue clq = (cl_command_queue)src1.clCxt->oclCommandQueue();
switch(src1.type()) switch(src1.type())
{ {
case CV_32FC1: case CV_32FC1:
@ -97,11 +97,12 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
offa /= sizeof(float); offa /= sizeof(float);
offb /= sizeof(float); offb /= sizeof(float);
offc /= sizeof(float); offc /= sizeof(float);
openCLSafeCall openCLSafeCall
( (
clAmdBlasSgemmEx(order, transA, transB, M, N, K, clAmdBlasSgemmEx(order, transA, transB, M, N, K,
alpha, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb, alpha, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb,
beta, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL) beta, (cl_mem)dst.data, offc, ldc, 1, &clq, 0, NULL, NULL)
); );
break; break;
case CV_64FC1: case CV_64FC1:
@ -115,7 +116,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
( (
clAmdBlasDgemmEx(order, transA, transB, M, N, K, clAmdBlasDgemmEx(order, transA, transB, M, N, K,
alpha, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb, alpha, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb,
beta, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL) beta, (cl_mem)dst.data, offc, ldc, 1, &clq, 0, NULL, NULL)
); );
break; break;
case CV_32FC2: case CV_32FC2:
@ -132,7 +133,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
( (
clAmdBlasCgemmEx(order, transA, transB, M, N, K, clAmdBlasCgemmEx(order, transA, transB, M, N, K,
alpha_2, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb, alpha_2, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb,
beta_2, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL) beta_2, (cl_mem)dst.data, offc, ldc, 1, &clq, 0, NULL, NULL)
); );
} }
break; break;
@ -150,7 +151,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
( (
clAmdBlasZgemmEx(order, transA, transB, M, N, K, clAmdBlasZgemmEx(order, transA, transB, M, N, K,
alpha_2, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb, alpha_2, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb,
beta_2, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL) beta_2, (cl_mem)dst.data, offc, ldc, 1, &clq, 0, NULL, NULL)
); );
} }
break; break;

View File

@ -971,7 +971,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
size_t blocksize = 8; size_t blocksize = 8;
size_t localThreads[3] = { blocksize, blocksize , 1 }; size_t localThreads[3] = { blocksize, blocksize , 1 };
size_t globalThreads[3] = { grp_per_CU *((gsum.clCxt)->impl->maxComputeUnits) *localThreads[0], size_t globalThreads[3] = { grp_per_CU *((gsum.clCxt)->computeUnits()) *localThreads[0],
localThreads[1], 1 localThreads[1], 1
}; };
int outputsz = 256 * globalThreads[0] / localThreads[0]; int outputsz = 256 * globalThreads[0] / localThreads[0];
@ -1047,21 +1047,21 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count); stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
//openCLVerifyCall(status); //openCLVerifyCall(status);
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL)); openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
//classifierbuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifier)*totalclassifier,NULL,&status); //classifierbuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifier)*totalclassifier,NULL,&status);
//status = clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,classifierbuffer,1,0,sizeof(GpuHidHaarClassifier)*totalclassifier,classifier,0,NULL,NULL); //status = clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,classifierbuffer,1,0,sizeof(GpuHidHaarClassifier)*totalclassifier,classifier,0,NULL,NULL);
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode)); nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode));
//openCLVerifyCall(status); //openCLVerifyCall(status);
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, nodebuffer, 1, 0, openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), nodebuffer, 1, 0,
nodenum * sizeof(GpuHidHaarTreeNode), nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL)); node, 0, NULL, NULL));
candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY, 4 * sizeof(int) * outputsz); candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY, 4 * sizeof(int) * outputsz);
//openCLVerifyCall(status); //openCLVerifyCall(status);
scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount); scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
//openCLVerifyCall(status); //openCLVerifyCall(status);
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL)); openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
//flag = 1; //flag = 1;
//} //}
@ -1186,7 +1186,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int grp_per_CU = 12; int grp_per_CU = 12;
size_t blocksize = 8; size_t blocksize = 8;
size_t localThreads[3] = { blocksize, blocksize , 1 }; size_t localThreads[3] = { blocksize, blocksize , 1 };
size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->impl->maxComputeUnits *localThreads[0], size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->computeUnits() *localThreads[0],
localThreads[1], 1 localThreads[1], 1
}; };
int outputsz = 256 * globalThreads[0] / localThreads[0]; int outputsz = 256 * globalThreads[0] / localThreads[0];
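For scale, the launch geometry above on a hypothetical device with 8 compute units works out to globalThreads[0] = grp_per_CU * computeUnits() * localThreads[0] = 12 * 8 * 8 = 768 work-items in dim0, and outputsz = 256 * 768 / 8 = 24576 candidate slots; the only change in this hunk is that the compute-unit count is now read through the public computeUnits() accessor instead of impl->maxComputeUnits.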
@ -1195,7 +1195,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY,
nodenum * sizeof(GpuHidHaarTreeNode)); nodenum * sizeof(GpuHidHaarTreeNode));
//openCLVerifyCall(status); //openCLVerifyCall(status);
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, nodebuffer, 1, 0, openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), nodebuffer, 1, 0,
nodenum * sizeof(GpuHidHaarTreeNode), nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL)); node, 0, NULL, NULL));
cl_mem newnodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_WRITE, cl_mem newnodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_WRITE,
@ -1252,16 +1252,16 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int splitnode = stage[0].count + stage[1].count + stage[2].count; int splitnode = stage[0].count + stage[1].count + stage[2].count;
stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count); stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
//openCLVerifyCall(status); //openCLVerifyCall(status);
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL)); openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, 4 * sizeof(int) * outputsz); candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, 4 * sizeof(int) * outputsz);
//openCLVerifyCall(status); //openCLVerifyCall(status);
scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount); scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
//openCLVerifyCall(status); //openCLVerifyCall(status);
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL)); openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
pbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_int4) * loopcount); pbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_int4) * loopcount);
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL)); openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL));
correctionbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_float) * loopcount); correctionbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_float) * loopcount);
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL)); openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL));
//int argcount = 0; //int argcount = 0;
vector<pair<size_t, const void *> > args; vector<pair<size_t, const void *> > args;
@ -1286,7 +1286,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1); openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
//openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->clCmdQueue,candidatebuffer,1,0,4*sizeof(int)*outputsz,candidate,0,NULL,NULL)); //openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->clCmdQueue,candidatebuffer,1,0,4*sizeof(int)*outputsz,candidate,0,NULL,NULL));
candidate = (int *)clEnqueueMapBuffer(gsum.clCxt->impl->clCmdQueue, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int), 0, 0, 0, &status); candidate = (int *)clEnqueueMapBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int), 0, 0, 0, &status);
for(int i = 0; i < outputsz; i++) for(int i = 0; i < outputsz; i++)
{ {
@ -1297,7 +1297,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
free(scaleinfo); free(scaleinfo);
free(p); free(p);
free(correction); free(correction);
clEnqueueUnmapMemObject(gsum.clCxt->impl->clCmdQueue, candidatebuffer, candidate, 0, 0, 0); clEnqueueUnmapMemObject((cl_command_queue)gsum.clCxt->oclCommandQueue(), candidatebuffer, candidate, 0, 0, 0);
openCLSafeCall(clReleaseMemObject(stagebuffer)); openCLSafeCall(clReleaseMemObject(stagebuffer));
openCLSafeCall(clReleaseMemObject(scaleinfobuffer)); openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
openCLSafeCall(clReleaseMemObject(nodebuffer)); openCLSafeCall(clReleaseMemObject(nodebuffer));

View File

@ -44,7 +44,7 @@
//M*/ //M*/
#include "precomp.hpp" #include "precomp.hpp"
#include "mcwutil.hpp"
using namespace cv; using namespace cv;
using namespace cv::ocl; using namespace cv::ocl;
using namespace std; using namespace std;

View File

@ -290,8 +290,8 @@ namespace cv
args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
args.push_back( make_pair(sizeof(cl_int), (void *)&cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]}; float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
if(src.clCxt -> impl -> double_support != 0) if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
{ {
args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue)); args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
} }
@ -319,7 +319,7 @@ namespace cv
args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols));
args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
args.push_back( make_pair(sizeof(cl_int), (void *)&cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
if(src.clCxt -> impl -> double_support != 0) if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
{ {
args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue)); args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
} }
@ -383,7 +383,7 @@ namespace cv
args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
if(src.clCxt -> impl -> double_support != 0) if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
{ {
args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d)); args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d));
args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d)); args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d));
@ -824,12 +824,12 @@ namespace cv
string kernelName = "warpAffine" + s[interpolation]; string kernelName = "warpAffine" + s[interpolation];
if(src.clCxt -> impl -> double_support != 0) if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
{ {
cl_int st; cl_int st;
coeffs_cm = clCreateBuffer( clCxt->impl->clContext, CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st ); coeffs_cm = clCreateBuffer( (cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st );
openCLVerifyCall(st); openCLVerifyCall(st);
openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0)); openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0));
} }
else else
{ {
@ -839,8 +839,8 @@ namespace cv
{ {
float_coeffs[m][n] = coeffs[m][n]; float_coeffs[m][n] = coeffs[m][n];
} }
coeffs_cm = clCreateBuffer( clCxt->impl->clContext, CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st ); coeffs_cm = clCreateBuffer( (cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st );
openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0)); openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0));
} }
//TODO: improve this kernel //TODO: improve this kernel
@ -894,12 +894,12 @@ namespace cv
string s[3] = {"NN", "Linear", "Cubic"}; string s[3] = {"NN", "Linear", "Cubic"};
string kernelName = "warpPerspective" + s[interpolation]; string kernelName = "warpPerspective" + s[interpolation];
if(src.clCxt -> impl -> double_support != 0) if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
{ {
cl_int st; cl_int st;
coeffs_cm = clCreateBuffer( clCxt->impl->clContext, CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st ); coeffs_cm = clCreateBuffer((cl_context) clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st );
openCLVerifyCall(st); openCLVerifyCall(st);
openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0)); openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0));
} }
else else
{ {
@ -908,9 +908,9 @@ namespace cv
for(int n = 0; n < 3; n++) for(int n = 0; n < 3; n++)
float_coeffs[m][n] = coeffs[m][n]; float_coeffs[m][n] = coeffs[m][n];
coeffs_cm = clCreateBuffer( clCxt->impl->clContext, CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st ); coeffs_cm = clCreateBuffer((cl_context) clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st );
openCLVerifyCall(st); openCLVerifyCall(st);
openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0)); openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0));
} }
//TODO: improve this kernel //TODO: improve this kernel
size_t blkSizeX = 16, blkSizeY = 16; size_t blkSizeX = 16, blkSizeY = 16;
@ -1018,7 +1018,7 @@ namespace cv
void integral(const oclMat &src, oclMat &sum, oclMat &sqsum) void integral(const oclMat &src, oclMat &sum, oclMat &sqsum)
{ {
CV_Assert(src.type() == CV_8UC1); CV_Assert(src.type() == CV_8UC1);
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "select device don't support double"); CV_Error(CV_GpuNotSupported, "select device don't support double");
} }
@ -1192,7 +1192,7 @@ namespace cv
void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize, void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
double k, int borderType) double k, int borderType)
{ {
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "select device don't support double"); CV_Error(CV_GpuNotSupported, "select device don't support double");
} }
@ -1206,7 +1206,7 @@ namespace cv
void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType) void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
{ {
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F) if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{ {
CV_Error(CV_GpuNotSupported, "select device don't support double"); CV_Error(CV_GpuNotSupported, "select device don't support double");
} }
@ -1260,7 +1260,7 @@ namespace cv
if( src.depth() != CV_8U || src.oclchannels() != 4 ) if( src.depth() != CV_8U || src.oclchannels() != 4 )
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" ); CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
// if(src.clCxt->impl->double_support == 0) // if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
// { // {
// CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n"); // CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
// } // }
@ -1328,7 +1328,7 @@ namespace cv
if( src.depth() != CV_8U || src.oclchannels() != 4 ) if( src.depth() != CV_8U || src.oclchannels() != 4 )
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" ); CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
// if(src.clCxt->impl->double_support == 0) // if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
// { // {
// CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n"); // CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
// } // }

View File

@ -77,7 +77,7 @@ namespace cv
ProgramCache *programCache = NULL; ProgramCache *programCache = NULL;
DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT; DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
DevMemRW gDeviceMemRW = DEVICE_MEM_R_W; DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
int gDevMemTypeValueMap[5] = {0, int gDevMemTypeValueMap[5] = {0,
CL_MEM_ALLOC_HOST_PTR, CL_MEM_ALLOC_HOST_PTR,
CL_MEM_USE_HOST_PTR, CL_MEM_USE_HOST_PTR,
CL_MEM_COPY_HOST_PTR, CL_MEM_COPY_HOST_PTR,
@ -124,26 +124,8 @@ namespace cv
cacheSize = 0; cacheSize = 0;
} }
////////////////////////Common OpenCL specific calls///////////////
int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type)
{
rw_type = gDeviceMemRW;
mem_type = gDeviceMemType;
return Context::getContext()->impl->unified_memory;
}
int setDevMemType(DevMemRW rw_type, DevMemType mem_type) struct Info::Impl
{
if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) ||
mem_type == DEVICE_MEM_UHP ||
mem_type == DEVICE_MEM_CHP )
return -1;
gDeviceMemRW = rw_type;
gDeviceMemType = mem_type;
return 0;
}
struct Info::Impl
{ {
cl_platform_id oclplatform; cl_platform_id oclplatform;
std::vector<cl_device_id> devices; std::vector<cl_device_id> devices;
@ -152,18 +134,144 @@ namespace cv
cl_context oclcontext; cl_context oclcontext;
cl_command_queue clCmdQueue; cl_command_queue clCmdQueue;
int devnum; int devnum;
cl_uint maxDimensions;
size_t maxWorkGroupSize; size_t maxWorkGroupSize;
size_t *maxWorkItemSizes; cl_uint maxDimensions; // == maxWorkItemSizes.size()
std::vector<size_t> maxWorkItemSizes;
cl_uint maxComputeUnits; cl_uint maxComputeUnits;
char extra_options[512]; char extra_options[512];
int double_support; int double_support;
int unified_memory; //1 means integrated GPU, otherwise this value is 0
string binpath;
int refcounter;
Impl() Impl()
{ {
refcounter = 1;
oclplatform = 0;
oclcontext = 0;
clCmdQueue = 0;
devnum = -1;
maxComputeUnits = 0;
maxWorkGroupSize = 0;
memset(extra_options, 0, 512); memset(extra_options, 0, 512);
double_support = 0;
unified_memory = 0;
} }
void setDevice(void *ctx, void *q, int devnum);
void release()
{
if(1 == CV_XADD(&refcounter, -1))
{
releaseResources();
delete this;
}
}
Impl* copy()
{
CV_XADD(&refcounter, 1);
return this;
}
private:
Impl(const Impl&);
Impl& operator=(const Impl&);
void releaseResources();
}; };
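A note on the reference-counting idiom introduced here: CV_XADD is OpenCV's atomic fetch-and-add and returns the value held before the update, so the release() test fires exactly on the 1 -> 0 transition. Worked out:

//   refcounter == 2 : CV_XADD(&refcounter, -1) returns 2, only the count drops to 1
//   refcounter == 1 : CV_XADD(&refcounter, -1) returns 1, releaseResources() runs and the Impl is deleted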
void Info::Impl::releaseResources()
{
devnum = -1;
if(clCmdQueue)
{
openCLSafeCall(clReleaseCommandQueue(clCmdQueue));
clCmdQueue = 0;
}
if(oclcontext)
{
openCLSafeCall(clReleaseContext(oclcontext));
oclcontext = 0;
}
}
void Info::Impl::setDevice(void *ctx, void *q, int dnum)
{
if((ctx && q) || devnum != dnum)
releaseResources();
CV_Assert(dnum >= 0 && dnum < (int)devices.size());
devnum = dnum;
if(ctx && q)
{
oclcontext = (cl_context)ctx;
clCmdQueue = (cl_command_queue)q;
clRetainContext(oclcontext);
clRetainCommandQueue(clCmdQueue);
}
else
{
cl_int status = 0;
cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(oclplatform), 0 };
oclcontext = clCreateContext(cps, 1, &devices[devnum], 0, 0, &status);
openCLVerifyCall(status);
clCmdQueue = clCreateCommandQueue(oclcontext, devices[devnum], CL_QUEUE_PROFILING_ENABLE, &status);
openCLVerifyCall(status);
}
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&maxWorkGroupSize, 0));
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void *)&maxDimensions, 0));
maxWorkItemSizes.resize(maxDimensions);
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDimensions, (void *)&maxWorkItemSizes[0], 0));
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), (void *)&maxComputeUnits, 0));
cl_bool unfymem = false;
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), (void *)&unfymem, 0));
unified_memory = unfymem ? 1 : 0;
//initialize extra options for compilation. Currently only fp64 is included.
//Assume 4KB is enough to store all possible extensions.
const int EXT_LEN = 4096 + 1 ;
char extends_set[EXT_LEN];
size_t extends_size;
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_EXTENSIONS, EXT_LEN, (void *)extends_set, &extends_size));
extends_set[EXT_LEN - 1] = 0;
size_t fp64_khr = std::string(extends_set).find("cl_khr_fp64");
if(fp64_khr != std::string::npos)
{
sprintf(extra_options, "-D DOUBLE_SUPPORT");
double_support = 1;
}
else
{
memset(extra_options, 0, 512);
double_support = 0;
}
}
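The "-D DOUBLE_SUPPORT" option stored in extra_options is meant for kernel compilation; a hedged sketch of how a kernel would typically consume it (illustrative OpenCL C, not part of this patch):

#ifdef DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
typedef double F;     // kernels can widen their working type when fp64 is present
#else
typedef float F;
#endif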
////////////////////////Common OpenCL specific calls///////////////
int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type)
{
rw_type = gDeviceMemRW;
mem_type = gDeviceMemType;
return Context::getContext()->impl->unified_memory;
}
int setDevMemType(DevMemRW rw_type, DevMemType mem_type)
{
if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) ||
mem_type == DEVICE_MEM_UHP ||
mem_type == DEVICE_MEM_CHP )
return -1;
gDeviceMemRW = rw_type;
gDeviceMemType = mem_type;
return 0;
}
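Hedged usage sketch of the memory-type helpers above (namespace qualification and enum spellings assumed from this file; pinned mode is only accepted when the selected device reports unified host memory):

cv::ocl::DevMemRW rw;  cv::ocl::DevMemType mt;
int unified = cv::ocl::getDevMemType(rw, mt);      // also reports the current settings
if (unified && cv::ocl::setDevMemType(cv::ocl::DEVICE_MEM_R_W, cv::ocl::DEVICE_MEM_PM) == 0)
{
    // subsequent buffer allocations take the pinned-memory (DEVICE_MEM_PM) path
}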
inline int divUp(int total, int grain) inline int divUp(int total, int grain)
{ {
return (total + grain - 1) / grain; return (total + grain - 1) / grain;
@ -171,6 +279,9 @@ namespace cv
int getDevice(std::vector<Info> &oclinfo, int devicetype) int getDevice(std::vector<Info> &oclinfo, int devicetype)
{ {
//TODO: cache oclinfo vector
oclinfo.clear();
switch(devicetype) switch(devicetype)
{ {
case CVCL_DEVICE_TYPE_DEFAULT: case CVCL_DEVICE_TYPE_DEFAULT:
@ -180,125 +291,62 @@ namespace cv
case CVCL_DEVICE_TYPE_ALL: case CVCL_DEVICE_TYPE_ALL:
break; break;
default: default:
CV_Error(CV_GpuApiCallError, "Unkown device type"); return 0;
} }
int devcienums = 0;
// Platform info
cl_int status = 0;
cl_uint numPlatforms;
Info ocltmpinfo;
openCLSafeCall(clGetPlatformIDs(0, NULL, &numPlatforms));
CV_Assert(numPlatforms > 0);
cl_platform_id *platforms = new cl_platform_id[numPlatforms];
openCLSafeCall(clGetPlatformIDs(numPlatforms, platforms, NULL)); // Platform info
cl_uint numPlatforms;
openCLSafeCall(clGetPlatformIDs(0, 0, &numPlatforms));
if(numPlatforms < 1) return 0;
std::vector<cl_platform_id> platforms(numPlatforms);
openCLSafeCall(clGetPlatformIDs(numPlatforms, &platforms[0], 0));
char deviceName[256]; char deviceName[256];
int devcienums = 0;
for (unsigned i = 0; i < numPlatforms; ++i) for (unsigned i = 0; i < numPlatforms; ++i)
{ {
cl_uint numsdev; cl_uint numsdev;
status = clGetDeviceIDs(platforms[i], devicetype, 0, NULL, &numsdev); cl_int status = clGetDeviceIDs(platforms[i], devicetype, 0, NULL, &numsdev);
if(status != CL_DEVICE_NOT_FOUND) if(status != CL_DEVICE_NOT_FOUND)
{
openCLVerifyCall(status); openCLVerifyCall(status);
}
if(numsdev > 0) if(numsdev > 0)
{ {
devcienums += numsdev; devcienums += numsdev;
cl_device_id *devices = new cl_device_id[numsdev]; std::vector<cl_device_id> devices(numsdev);
openCLSafeCall(clGetDeviceIDs(platforms[i], devicetype, numsdev, devices, NULL)); openCLSafeCall(clGetDeviceIDs(platforms[i], devicetype, numsdev, &devices[0], 0));
Info ocltmpinfo;
ocltmpinfo.impl->oclplatform = platforms[i]; ocltmpinfo.impl->oclplatform = platforms[i];
for(unsigned j = 0; j < numsdev; j++) for(unsigned j = 0; j < numsdev; ++j)
{ {
ocltmpinfo.impl->devices.push_back(devices[j]); ocltmpinfo.impl->devices.push_back(devices[j]);
openCLSafeCall(clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 256, deviceName, NULL)); openCLSafeCall(clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(deviceName), deviceName, 0));
ocltmpinfo.impl->devName.push_back(std::string(deviceName)); ocltmpinfo.impl->devName.push_back(deviceName);
ocltmpinfo.DeviceName.push_back(std::string(deviceName)); ocltmpinfo.DeviceName.push_back(deviceName);
} }
delete[] devices;
oclinfo.push_back(ocltmpinfo); oclinfo.push_back(ocltmpinfo);
ocltmpinfo.release();
} }
} }
delete[] platforms;
if(devcienums > 0)
{
setDevice(oclinfo[0]);
}
return devcienums; return devcienums;
} }
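The enumeration flow changed here: an unknown device type and an empty platform list now return 0 instead of raising CV_Error, and getDevice() no longer selects a device implicitly. A hedged usage sketch (header path and constant qualification assumed):

#include <iostream>
#include <vector>
#include "opencv2/ocl/ocl.hpp"   // assumed public header of the ocl module

static bool pickFirstDevice()
{
    std::vector<cv::ocl::Info> infos;
    if (cv::ocl::getDevice(infos, CVCL_DEVICE_TYPE_ALL) <= 0)   // 0 == nothing usable found
        return false;
    cv::ocl::setDevice(infos[0], 0);                  // explicit selection is now required
    std::cout << infos[0].DeviceName[0] << std::endl; // first device of the first platform
    return true;
}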
static void fillClcontext(Info &oclinfo)
{
//get device information
size_t devnum = oclinfo.impl->devnum;
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(size_t), (void *)&oclinfo.impl->maxWorkGroupSize, NULL));
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
sizeof(cl_uint), (void *)&oclinfo.impl->maxDimensions, NULL));
oclinfo.impl->maxWorkItemSizes = new size_t[oclinfo.impl->maxDimensions];
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_MAX_WORK_ITEM_SIZES,
sizeof(size_t)*oclinfo.impl->maxDimensions, (void *)oclinfo.impl->maxWorkItemSizes, NULL));
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(cl_uint), (void *)&oclinfo.impl->maxComputeUnits, NULL));
//initialize extra options for compilation. Currently only fp64 is included.
//Assume 4KB is enough to store all possible extensions.
const int EXT_LEN = 4096 + 1 ;
char extends_set[EXT_LEN];
size_t extends_size;
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_EXTENSIONS,
EXT_LEN, (void *)extends_set, &extends_size));
CV_Assert(extends_size < (size_t)EXT_LEN);
extends_set[EXT_LEN - 1] = 0;
memset(oclinfo.impl->extra_options, 0, 512);
oclinfo.impl->double_support = 0;
int fp64_khr = string(extends_set).find("cl_khr_fp64");
if(fp64_khr >= 0 && fp64_khr < EXT_LEN)
{
sprintf(oclinfo.impl->extra_options , "-D DOUBLE_SUPPORT");
oclinfo.impl -> double_support = 1;
}
Context::setContext(oclinfo);
}
void setDevice(Info &oclinfo, int devnum) void setDevice(Info &oclinfo, int devnum)
{ {
CV_Assert(devnum >= 0); oclinfo.impl->setDevice(0, 0, devnum);
cl_int status = 0; Context::setContext(oclinfo);
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties)(oclinfo.impl->oclplatform), 0
};
oclinfo.impl->devnum = devnum;
oclinfo.impl->oclcontext = clCreateContext(cps, 1, &oclinfo.impl->devices[devnum], NULL, NULL, &status);
openCLVerifyCall(status);
//create the command queue using the first device of the list
oclinfo.impl->clCmdQueue = clCreateCommandQueue(oclinfo.impl->oclcontext, oclinfo.impl->devices[devnum],
CL_QUEUE_PROFILING_ENABLE, &status);
openCLVerifyCall(status);
fillClcontext(oclinfo);
} }
void setDeviceEx(Info &oclinfo, void *ctx, void *q, int devnum) void setDeviceEx(Info &oclinfo, void *ctx, void *q, int devnum)
{ {
CV_Assert(devnum >= 0); oclinfo.impl->setDevice(ctx, q, devnum);
oclinfo.impl->devnum = devnum; Context::setContext(oclinfo);
if(ctx && q)
{
oclinfo.impl->oclcontext = (cl_context)ctx;
oclinfo.impl->clCmdQueue = (cl_command_queue)q;
clRetainContext((cl_context)ctx);
clRetainCommandQueue((cl_command_queue)q);
fillClcontext(oclinfo);
}
} }
void *getoclContext() void *getoclContext()
{ {
return &(Context::getContext()->impl->clContext); return &(Context::getContext()->impl->oclcontext);
} }
void *getoclCommandQueue() void *getoclCommandQueue()
@ -316,7 +364,7 @@ namespace cv
cl_mem openCLCreateBuffer(Context *clCxt, size_t flag , size_t size) cl_mem openCLCreateBuffer(Context *clCxt, size_t flag , size_t size)
{ {
cl_int status; cl_int status;
cl_mem buffer = clCreateBuffer(clCxt->impl->clContext, (cl_mem_flags)flag, size, NULL, &status); cl_mem buffer = clCreateBuffer(clCxt->impl->oclcontext, (cl_mem_flags)flag, size, NULL, &status);
openCLVerifyCall(status); openCLVerifyCall(status);
return buffer; return buffer;
} }
@ -331,8 +379,7 @@ namespace cv
size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type) size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
{ {
cl_int status; cl_int status;
*dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
*dev_ptr = clCreateBuffer(clCxt->impl->clContext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
widthInBytes * height, 0, &status); widthInBytes * height, 0, &status);
openCLVerifyCall(status); openCLVerifyCall(status);
*pitch = widthInBytes; *pitch = widthInBytes;
@ -340,7 +387,7 @@ namespace cv
void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch, void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
const void *src, size_t spitch, const void *src, size_t spitch,
size_t width, size_t height, enum openCLMemcpyKind kind, int channels) size_t width, size_t height, openCLMemcpyKind kind, int channels)
{ {
size_t buffer_origin[3] = {0, 0, 0}; size_t buffer_origin[3] = {0, 0, 0};
size_t host_origin[3] = {0, 0, 0}; size_t host_origin[3] = {0, 0, 0};
@ -398,7 +445,7 @@ namespace cv
void setBinpath(const char *path) void setBinpath(const char *path)
{ {
Context *clcxt = Context::getContext(); Context *clcxt = Context::getContext();
clcxt->impl->Binpath = path; clcxt->impl->binpath = path;
} }
int savetofile(const Context*, cl_program &program, const char *fileName) int savetofile(const Context*, cl_program &program, const char *fileName)
@ -442,11 +489,11 @@ namespace cv
if(NULL != build_options) if(NULL != build_options)
{ {
src_sign << (int64)(*source) << clCxt->impl->clContext << "_" << build_options; src_sign << (int64)(*source) << clCxt->impl->oclcontext << "_" << build_options;
} }
else else
{ {
src_sign << (int64)(*source) << clCxt->impl->clContext; src_sign << (int64)(*source) << clCxt->impl->oclcontext;
} }
srcsign = src_sign.str(); srcsign = src_sign.str();
@ -466,24 +513,24 @@ namespace cv
strcat(all_build_options, build_options); strcat(all_build_options, build_options);
if(all_build_options != NULL) if(all_build_options != NULL)
{ {
filename = clCxt->impl->Binpath + kernelName + "_" + clCxt->impl->devName + all_build_options + ".clb"; filename = clCxt->impl->binpath + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + all_build_options + ".clb";
} }
else else
{ {
filename = clCxt->impl->Binpath + kernelName + "_" + clCxt->impl->devName + ".clb"; filename = clCxt->impl->binpath + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + ".clb";
} }
FILE *fp = fopen(filename.c_str(), "rb"); FILE *fp = fopen(filename.c_str(), "rb");
if(fp == NULL || clCxt->impl->Binpath.size() == 0) //we should generate a binary file for the first time. if(fp == NULL || clCxt->impl->binpath.size() == 0) //we should generate a binary file for the first time.
{ {
if(fp != NULL) if(fp != NULL)
fclose(fp); fclose(fp);
program = clCreateProgramWithSource( program = clCreateProgramWithSource(
clCxt->impl->clContext, 1, source, NULL, &status); clCxt->impl->oclcontext, 1, source, NULL, &status);
openCLVerifyCall(status); openCLVerifyCall(status);
status = clBuildProgram(program, 1, &(clCxt->impl->devices), all_build_options, NULL, NULL); status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
if(status == CL_SUCCESS && clCxt->impl->Binpath.size()) if(status == CL_SUCCESS && clCxt->impl->binpath.size())
savetofile(clCxt, program, filename.c_str()); savetofile(clCxt, program, filename.c_str());
} }
else else
@ -495,15 +542,15 @@ namespace cv
CV_Assert(1 == fread(binary, binarySize, 1, fp)); CV_Assert(1 == fread(binary, binarySize, 1, fp));
fclose(fp); fclose(fp);
cl_int status = 0; cl_int status = 0;
program = clCreateProgramWithBinary(clCxt->impl->clContext, program = clCreateProgramWithBinary(clCxt->impl->oclcontext,
1, 1,
&(clCxt->impl->devices), &(clCxt->impl->devices[clCxt->impl->devnum]),
(const size_t *)&binarySize, (const size_t *)&binarySize,
(const unsigned char **)&binary, (const unsigned char **)&binary,
NULL, NULL,
&status); &status);
openCLVerifyCall(status); openCLVerifyCall(status);
status = clBuildProgram(program, 1, &(clCxt->impl->devices), all_build_options, NULL, NULL); status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
delete[] binary; delete[] binary;
} }
@ -515,14 +562,14 @@ namespace cv
char *buildLog = NULL; char *buildLog = NULL;
size_t buildLogSize = 0; size_t buildLogSize = 0;
logStatus = clGetProgramBuildInfo(program, logStatus = clGetProgramBuildInfo(program,
clCxt->impl->devices, CL_PROGRAM_BUILD_LOG, buildLogSize, clCxt->impl->devices[clCxt->impl->devnum], CL_PROGRAM_BUILD_LOG, buildLogSize,
buildLog, &buildLogSize); buildLog, &buildLogSize);
if(logStatus != CL_SUCCESS) if(logStatus != CL_SUCCESS)
cout << "Failed to build the program and get the build info." << endl; cout << "Failed to build the program and get the build info." << endl;
buildLog = new char[buildLogSize]; buildLog = new char[buildLogSize];
CV_DbgAssert(!!buildLog); CV_DbgAssert(!!buildLog);
memset(buildLog, 0, buildLogSize); memset(buildLog, 0, buildLogSize);
openCLSafeCall(clGetProgramBuildInfo(program, clCxt->impl->devices, openCLSafeCall(clGetProgramBuildInfo(program, clCxt->impl->devices[clCxt->impl->devnum],
CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL)); CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL));
cout << "\n\t\t\tBUILD LOG\n"; cout << "\n\t\t\tBUILD LOG\n";
cout << buildLog << endl; cout << buildLog << endl;
@ -544,13 +591,13 @@ namespace cv
void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads) void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads)
{ {
size_t kernelWorkGroupSize; size_t kernelWorkGroupSize;
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices, openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices[clCxt->impl->devnum],
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0)); CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
CV_Assert( (localThreads[0] <= clCxt->impl->maxWorkItemSizes[0]) && CV_Assert( localThreads[0] <= clCxt->impl->maxWorkItemSizes[0] );
(localThreads[1] <= clCxt->impl->maxWorkItemSizes[1]) && CV_Assert( localThreads[1] <= clCxt->impl->maxWorkItemSizes[1] );
(localThreads[2] <= clCxt->impl->maxWorkItemSizes[2]) && CV_Assert( localThreads[2] <= clCxt->impl->maxWorkItemSizes[2] );
((localThreads[0] * localThreads[1] * localThreads[2]) <= kernelWorkGroupSize) && CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize );
(localThreads[0] * localThreads[1] * localThreads[2]) <= clCxt->impl->maxWorkGroupSize); CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= clCxt->impl->maxWorkGroupSize );
} }
#ifdef PRINT_KERNEL_RUN_TIME #ifdef PRINT_KERNEL_RUN_TIME
@ -664,10 +711,10 @@ namespace cv
cout << "average kernel total time: " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl; cout << "average kernel total time: " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl;
#endif #endif
} }
double openCLExecuteKernelInterop(Context *clCxt , const char **source, string kernelName, double openCLExecuteKernelInterop(Context *clCxt , const char **source, string kernelName,
size_t globalThreads[3], size_t localThreads[3], size_t globalThreads[3], size_t localThreads[3],
vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options, vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
bool finish, bool measureKernelTime, bool cleanUp) bool finish, bool measureKernelTime, bool cleanUp)
{ {
@ -764,7 +811,7 @@ namespace cv
f.read(str, fileSize); f.read(str, fileSize);
f.close(); f.close();
str[size] = '\0'; str[size] = '\0';
s = str; s = str;
delete[] str; delete[] str;
return 0; return 0;
@ -775,7 +822,7 @@ namespace cv
double openCLExecuteKernelInterop(Context *clCxt , const char **fileName, const int numFiles, string kernelName, double openCLExecuteKernelInterop(Context *clCxt , const char **fileName, const int numFiles, string kernelName,
size_t globalThreads[3], size_t localThreads[3], size_t globalThreads[3], size_t localThreads[3],
vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options, vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
bool finish, bool measureKernelTime, bool cleanUp) bool finish, bool measureKernelTime, bool cleanUp)
{ {
@ -795,8 +842,8 @@ namespace cv
delete []source; delete []source;
return kernelTime; return kernelTime;
} }
cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value, cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
const size_t size) const size_t size)
{ {
int status; int status;
@ -815,142 +862,143 @@ namespace cv
/////////////////////////////OpenCL initialization///////////////// /////////////////////////////OpenCL initialization/////////////////
auto_ptr<Context> Context::clCxt; auto_ptr<Context> Context::clCxt;
int Context::val = 0; int Context::val = 0;
Mutex cs; static Mutex cs;
Context *Context::getContext() Context* Context::getContext()
{ {
if(val == 0) if(*((volatile int*)&val) != 1)
{ {
AutoLock al(cs); AutoLock al(cs);
if( NULL == clCxt.get()) if(*((volatile int*)&val) != 1)
{
if( 0 == clCxt.get())
clCxt.reset(new Context);
std::vector<Info> oclinfo;
CV_Assert(getDevice(oclinfo, CVCL_DEVICE_TYPE_ALL) > 0);
oclinfo[0].impl->setDevice(0, 0, 0);
clCxt.get()->impl = oclinfo[0].impl->copy();
*((volatile int*)&val) = 1;
}
}
return clCxt.get();
}
void Context::setContext(Info &oclinfo)
{
AutoLock guard(cs);
if(*((volatile int*)&val) != 1)
{
if( 0 == clCxt.get())
clCxt.reset(new Context); clCxt.reset(new Context);
val = 1; clCxt.get()->impl = oclinfo.impl->copy();
return clCxt.get();
*((volatile int*)&val) = 1;
} }
else else
{ {
return clCxt.get(); clCxt.get()->impl->release();
clCxt.get()->impl = oclinfo.impl->copy();
} }
} }
void Context::setContext(Info &oclinfo)
{
Context *clcxt = getContext();
clcxt->impl->clContext = oclinfo.impl->oclcontext;
clcxt->impl->clCmdQueue = oclinfo.impl->clCmdQueue;
clcxt->impl->devices = oclinfo.impl->devices[oclinfo.impl->devnum];
clcxt->impl->devName = oclinfo.impl->devName[oclinfo.impl->devnum];
clcxt->impl->maxDimensions = oclinfo.impl->maxDimensions;
clcxt->impl->maxWorkGroupSize = oclinfo.impl->maxWorkGroupSize;
for(size_t i=0; i<clcxt->impl->maxDimensions && i<4; i++)
clcxt->impl->maxWorkItemSizes[i] = oclinfo.impl->maxWorkItemSizes[i];
clcxt->impl->maxComputeUnits = oclinfo.impl->maxComputeUnits;
clcxt->impl->double_support = oclinfo.impl->double_support;
//extra options to recognize compiler options
memcpy(clcxt->impl->extra_options, oclinfo.impl->extra_options, 512);
cl_bool unfymem = false;
openCLSafeCall(clGetDeviceInfo(clcxt->impl->devices, CL_DEVICE_HOST_UNIFIED_MEMORY,
sizeof(cl_bool), (void *)&unfymem, NULL));
if(unfymem)
clcxt->impl->unified_memory = 1;
}
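Both getContext() and setContext() now follow the classic double-checked locking idiom around the file-scope Mutex, with val read through a volatile cast as the "already initialized" flag:

//   if (!initialized)           // cheap, unsynchronized fast path
//   {
//       AutoLock al(cs);        // cs is the file-scope Mutex
//       if (!initialized)       // re-check under the lock before creating the Context
//           { create();  initialized = 1; }
//   }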
Context::Context() Context::Context()
{ {
impl = new Impl; impl = 0;
//Information of the OpenCL context
impl->clContext = NULL;
impl->clCmdQueue = NULL;
impl->devices = NULL;
impl->maxDimensions = 0;
impl->maxWorkGroupSize = 0;
for(int i=0; i<4; i++)
impl->maxWorkItemSizes[i] = 0;
impl->maxComputeUnits = 0;
impl->double_support = 0;
//extra options to recognize vendor specific fp64 extensions
memset(impl->extra_options, 0, 512);
impl->unified_memory = 0;
programCache = ProgramCache::getProgramCache(); programCache = ProgramCache::getProgramCache();
} }
Context::~Context() Context::~Context()
{ {
delete impl; release();
}
void Context::release()
{
if (impl)
impl->release();
programCache->releaseProgram(); programCache->releaseProgram();
} }
bool Context::supportsFeature(int ftype)
{
switch(ftype)
{
case CL_DOUBLE:
return impl->double_support == 1;
case CL_UNIFIED_MEM:
return impl->unified_memory == 1;
default:
return false;
}
}
size_t Context::computeUnits()
{
return impl->maxComputeUnits;
}
void* Context::oclContext()
{
return impl->oclcontext;
}
void* Context::oclCommandQueue()
{
return impl->clCmdQueue;
}
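Hedged usage sketch of the new public accessors (the raw CL handles come back as void* and need an explicit cast on the caller's side):

cv::ocl::Context *ctx = cv::ocl::Context::getContext();
bool   hasFp64 = ctx->supportsFeature(cv::ocl::Context::CL_DOUBLE);
size_t cus     = ctx->computeUnits();
cl_context       clctx = (cl_context)ctx->oclContext();
cl_command_queue clq   = (cl_command_queue)ctx->oclCommandQueue();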
Info::Info() Info::Info()
{ {
impl = new Impl; impl = new Impl;
impl->oclplatform = 0;
impl->oclcontext = 0;
impl->clCmdQueue = 0;
impl->devnum = 0;
impl->maxDimensions = 0;
impl->maxWorkGroupSize = 0;
impl->maxWorkItemSizes = 0;
impl->maxComputeUnits = 0;
impl->double_support = 0;
//extra_options = 0;
} }
void Info::release() void Info::release()
{ {
fft_teardown(); fft_teardown();
if(impl->oclplatform) impl->release();
{ impl = new Impl;
impl->oclplatform = 0;
}
if(impl->clCmdQueue)
{
openCLSafeCall(clReleaseCommandQueue(impl->clCmdQueue));
}
ProgramCache::getProgramCache()->releaseProgram();
if(impl->oclcontext)
{
openCLSafeCall(clReleaseContext(impl->oclcontext));
}
if(impl->maxWorkItemSizes)
{
delete[] impl->maxWorkItemSizes;
impl->maxWorkItemSizes = 0;
}
//if(extra_options)
//{
// delete[] extra_options;
// extra_options = 0;
//}
impl->devices.clear();
impl->devName.clear();
DeviceName.clear(); DeviceName.clear();
} }
Info::~Info() Info::~Info()
{ {
release(); fft_teardown();
delete impl; impl->release();
} }
Info &Info::operator = (const Info &m) Info &Info::operator = (const Info &m)
{ {
impl->oclplatform = m.impl->oclplatform; impl->release();
impl->oclcontext = m.impl->oclcontext; impl = m.impl->copy();
impl->clCmdQueue = m.impl->clCmdQueue; DeviceName = m.DeviceName;
impl->devnum = m.impl->devnum;
impl->maxDimensions = m.impl->maxDimensions;
impl->maxWorkGroupSize = m.impl->maxWorkGroupSize;
impl->maxWorkItemSizes = m.impl->maxWorkItemSizes;
impl->maxComputeUnits = m.impl->maxComputeUnits;
impl->double_support = m.impl->double_support;
memcpy(impl->extra_options, m.impl->extra_options, 512);
for(size_t i = 0; i < m.impl->devices.size(); i++)
{
impl->devices.push_back(m.impl->devices[i]);
impl->devName.push_back(m.impl->devName[i]);
DeviceName.push_back(m.DeviceName[i]);
}
return *this; return *this;
} }
Info::Info(const Info &m) Info::Info(const Info &m)
{ {
impl = new Impl; impl = m.impl->copy();
*this = m; DeviceName = m.DeviceName;
} }
}//namespace ocl }//namespace ocl
}//namespace cv }//namespace cv
#if defined BUILD_SHARED_LIBS && defined CVAPI_EXPORTS && defined WIN32 && !defined WINCE
#include <windows.h>
BOOL WINAPI DllMain( HINSTANCE, DWORD fdwReason, LPVOID );
BOOL WINAPI DllMain( HINSTANCE, DWORD fdwReason, LPVOID )
{
if( fdwReason == DLL_PROCESS_DETACH )
{
// the application hangs if clReleaseCommandQueue is called here, so only the context is released
// without releasing the context the application hangs as well
cl_context ctx = (cl_context)getoclContext();
if(ctx)
openCLSafeCall(clReleaseContext(ctx));
}
return TRUE;
}
#endif

View File

@ -43,9 +43,7 @@
// //
//M*/ //M*/
#include <iomanip>
#include "precomp.hpp" #include "precomp.hpp"
#include "mcwutil.hpp"
using namespace std; using namespace std;
using namespace cv; using namespace cv;

View File

@ -1,865 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
#define MAX_FLOAT 1e7f
int bit1Count(float x)
{
int c = 0;
int ix = (int)x;
for (int i = 0 ; i < 32 ; i++)
{
c += ix & 0x1;
ix >>= 1;
}
return (float)c;
}
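As the commented-out lines further down show, bit1Count is a stand-in for a population count; where the popcount built-in is available the per-element Hamming term reduces to a one-liner (sketch):

//   result += popcount((uint)s_query[...] ^ (uint)s_train[...]);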
/* 2-dim launch. Global size: dim0 is (query_rows + block_size - 1) / block_size * block_size, dim1 is block_size;
   local size: dim0 is block_size, dim1 is block_size.
*/
__kernel void BruteForceMatch_UnrollMatch(
__global float *query,
__global float *train,
//__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
__local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * max_desc_len;
int queryIdx = groupidx * block_size + lidy;
// load the query into local memory.
for (int i = 0 ; i < max_desc_len / block_size; i ++)
{
int loadx = lidx + i * block_size;
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
}
float myBestDistance = MAX_FLOAT;
int myBestTrainIdx = -1;
// loopUnrolledCached to find the best trainIdx and best distance.
volatile int imgIdx = 0;
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{
float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; i++)
{
//load a block_size * block_size block into local train.
const int loadx = lidx + i * block_size;
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize to make sure each element needed by the reduce iteration is already written to shared memory.
barrier(CLK_LOCAL_MEM_FENCE);
/* There are three types of reducer: the first is L1Dist, which sums |v1 - v2|; the second is L2Dist, which
sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1. */
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
int trainIdx = t * block_size + lidx;
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
{
//bestImgIdx = imgIdx;
myBestDistance = result;
myBestTrainIdx = trainIdx;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
__local float *s_distance = (__local float *)(sharebuffer);
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
//find BestMatch
s_distance += lidy * block_size;
s_trainIdx += lidy * block_size;
s_distance[lidx] = myBestDistance;
s_trainIdx[lidx] = myBestTrainIdx;
barrier(CLK_LOCAL_MEM_FENCE);
//reduce -- every thread now performs the full reduction over the block.
for (int k = 0 ; k < block_size; k++)
{
if (myBestDistance > s_distance[k])
{
myBestDistance = s_distance[k];
myBestTrainIdx = s_trainIdx[k];
}
}
if (queryIdx < query_rows && lidx == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
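A hedged host-side sketch of the launch geometry described in the comment at the top of this kernel (block_size value and the query_rows variable are illustrative):

size_t block_size = 16;                                   // assumed tile size
size_t localThreads[3]  = { block_size, block_size, 1 };
size_t globalThreads[3] = { (query_rows + block_size - 1) / block_size * block_size,
                            block_size, 1 };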
__kernel void BruteForceMatch_Match(
__global float *query,
__global float *train,
//__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
const int queryIdx = groupidx * block_size + lidy;
float myBestDistance = MAX_FLOAT;
int myBestTrainIdx = -1;
__local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size;
// loop
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{
//Dist dist;
float result = 0;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
{
const int loadx = lidx + i * block_size;
//load query and train into local memory
s_query[lidy * block_size + lidx] = 0;
s_train[lidx * block_size + lidy] = 0;
if (loadx < query_cols)
{
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
}
barrier(CLK_LOCAL_MEM_FENCE);
/* There are three types of reducer: the first is L1Dist, which sums |v1 - v2|; the second is L2Dist, which
sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1. */
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
const int trainIdx = t * block_size + lidx;
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
{
//myBestImgidx = imgIdx;
myBestDistance = result;
myBestTrainIdx = trainIdx;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
__local float *s_distance = (__local float *)sharebuffer;
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
//findBestMatch
s_distance += lidy * block_size;
s_trainIdx += lidy * block_size;
s_distance[lidx] = myBestDistance;
s_trainIdx[lidx] = myBestTrainIdx;
barrier(CLK_LOCAL_MEM_FENCE);
//reduce -- every thread now performs the full reduction over the block.
for (int k = 0 ; k < block_size; k++)
{
if (myBestDistance > s_distance[k])
{
myBestDistance = s_distance[k];
myBestTrainIdx = s_trainIdx[k];
}
}
if (queryIdx < query_rows && lidx == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
//radius_unrollmatch
__kernel void BruteForceMatch_RadiusUnrollMatch(
__global float *query,
__global float *train,
float maxDistance,
//__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__global int *nMatches,
__local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int bestTrainIdx_cols,
int step,
int ostep,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
const int groupidy = get_group_id(1);
const int queryIdx = groupidy * block_size + lidy;
const int trainIdx = groupidx * block_size + lidx;
__local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size;
float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; ++i)
{
//load a block_size * block_size block into local train.
const int loadx = lidx + i * block_size;
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize to make sure each element needed by the reduce iteration is already written to shared memory.
barrier(CLK_LOCAL_MEM_FENCE);
/* There are three distance types in the reducer: L1Dist sums fabs(v1 - v2), L2Dist sums
(v1 - v2) * (v1 - v2), and Hamming sums popcount(v1 ^ v2), i.e. the number of bits set to 1. */
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; ++j)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; ++j)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; ++j)
{
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
{
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
if (ind < bestTrainIdx_cols)
{
//bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
}
}
}
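/* Note (added for clarity): nMatches[queryIdx] acts as an append counter for the radius match.
   atom_inc returns the value *before* the increment, so the first match found for a query lands
   in column 0, the next in column 1, and so on. When more than bestTrainIdx_cols matches exist,
   the extra ones are counted but not stored, so only the first min(nMatches[queryIdx],
   bestTrainIdx_cols) entries of bestTrainIdx/bestDistance are valid for that query. */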
//radius_match
__kernel void BruteForceMatch_RadiusMatch(
__global float *query,
__global float *train,
float maxDistance,
//__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__global int *nMatches,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int bestTrainIdx_cols,
int step,
int ostep,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
const int groupidy = get_group_id(1);
const int queryIdx = groupidy * block_size + lidy;
const int trainIdx = groupidx * block_size + lidx;
__local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size;
float result = 0;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
{
//load a block_size * block_size block into local train.
const int loadx = lidx + i * block_size;
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize to make sure every element needed by this reduce iteration has been written to shared memory.
barrier(CLK_LOCAL_MEM_FENCE);
/* There are three distance types in the reducer: L1Dist sums fabs(v1 - v2), L2Dist sums
(v1 - v2) * (v1 - v2), and Hamming sums popcount(v1 ^ v2), i.e. the number of bits set to 1. */
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; ++j)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; ++j)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; ++j)
{
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
{
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
if (ind < bestTrainIdx_cols)
{
//bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
}
}
}
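/* Note (added for clarity): this generic variant differs from BruteForceMatch_RadiusUnrollMatch
   above only in its trip count -- the unrolled kernel iterates max_desc_len / block_size times
   and therefore assumes max_desc_len is a multiple of block_size that covers the descriptor
   width, while this one iterates (query_cols + block_size - 1) / block_size times and so works
   for arbitrary descriptor widths. */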
__kernel void BruteForceMatch_knnUnrollMatch(
__global float *query,
__global float *train,
//__global float *mask,
__global int2 *bestTrainIdx,
__global float2 *bestDistance,
__local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
const int queryIdx = groupidx * block_size + lidy;
local float *s_query = sharebuffer;
local float *s_train = sharebuffer + block_size * max_desc_len;
// load the query into local memory.
for (int i = 0 ; i < max_desc_len / block_size; i ++)
{
int loadx = lidx + i * block_size;
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
}
float myBestDistance1 = MAX_FLOAT;
float myBestDistance2 = MAX_FLOAT;
int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1;
//loopUnrolledCached
volatile int imgIdx = 0;
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{
float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; i++)
{
//load a block_size * block_size block into local train.
const int loadx = lidx + i * block_size;
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize to make sure every element needed by this reduce iteration has been written to shared memory.
barrier(CLK_LOCAL_MEM_FENCE);
/* There are three distance types in the reducer: L1Dist sums fabs(v1 - v2), L2Dist sums
(v1 - v2) * (v1 - v2), and Hamming sums popcount(v1 ^ v2), i.e. the number of bits set to 1. */
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
const int trainIdx = t * block_size + lidx;
if (queryIdx < query_rows && trainIdx < train_rows)
{
if (result < myBestDistance1)
{
myBestDistance2 = myBestDistance1;
myBestTrainIdx2 = myBestTrainIdx1;
myBestDistance1 = result;
myBestTrainIdx1 = trainIdx;
}
else if (result < myBestDistance2)
{
myBestDistance2 = result;
myBestTrainIdx2 = trainIdx;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
local float *s_distance = (local float *)sharebuffer;
local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
// find BestMatch
s_distance += lidy * block_size;
s_trainIdx += lidy * block_size;
s_distance[lidx] = myBestDistance1;
s_trainIdx[lidx] = myBestTrainIdx1;
float bestDistance1 = MAX_FLOAT;
float bestDistance2 = MAX_FLOAT;
int bestTrainIdx1 = -1;
int bestTrainIdx2 = -1;
barrier(CLK_LOCAL_MEM_FENCE);
if (lidx == 0)
{
for (int i = 0 ; i < block_size ; i++)
{
float val = s_distance[i];
if (val < bestDistance1)
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestDistance1 = val;
bestTrainIdx1 = s_trainIdx[i];
}
else if (val < bestDistance2)
{
bestDistance2 = val;
bestTrainIdx2 = s_trainIdx[i];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
s_distance[lidx] = myBestDistance2;
s_trainIdx[lidx] = myBestTrainIdx2;
barrier(CLK_LOCAL_MEM_FENCE);
if (lidx == 0)
{
for (int i = 0 ; i < block_size ; i++)
{
float val = s_distance[i];
if (val < bestDistance2)
{
bestDistance2 = val;
bestTrainIdx2 = s_trainIdx[i];
}
}
}
myBestDistance1 = bestDistance1;
myBestDistance2 = bestDistance2;
myBestTrainIdx1 = bestTrainIdx1;
myBestTrainIdx2 = bestTrainIdx2;
if (queryIdx < query_rows && lidx == 0)
{
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
}
}
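/* Note (added for clarity): the best/second-best reduction above needs two passes. Pass 1 scans
   every thread's personal best (written to s_distance/s_trainIdx) to get the group-wide best and
   a provisional runner-up; pass 2 scans every thread's personal *second* best, because the true
   runner-up may have lost to the winner inside a single thread and never show up in pass 1.
   Example: thread A holds {1, 2} and thread B holds {5, 6}. Pass 1 over the firsts {1, 5} gives
   best = 1, second = 5; pass 2 over the seconds {2, 6} corrects the second best to 2. */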
__kernel void BruteForceMatch_knnMatch(
__global float *query,
__global float *train,
//__global float *mask,
__global int2 *bestTrainIdx,
__global float2 *bestDistance,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
const int queryIdx = groupidx * block_size + lidy;
local float *s_query = sharebuffer;
local float *s_train = sharebuffer + block_size * block_size;
float myBestDistance1 = MAX_FLOAT;
float myBestDistance2 = MAX_FLOAT;
int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1;
//loop
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{
float result = 0.0f;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
{
const int loadx = lidx + i * block_size;
//load query and train into local memory
s_query[lidy * block_size + lidx] = 0;
s_train[lidx * block_size + lidy] = 0;
if (loadx < query_cols)
{
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
}
barrier(CLK_LOCAL_MEM_FENCE);
/* There are three distance types in the reducer: L1Dist sums fabs(v1 - v2), L2Dist sums
(v1 - v2) * (v1 - v2), and Hamming sums popcount(v1 ^ v2), i.e. the number of bits set to 1. */
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
const int trainIdx = t * block_size + lidx;
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
{
if (result < myBestDistance1)
{
myBestDistance2 = myBestDistance1;
myBestTrainIdx2 = myBestTrainIdx1;
myBestDistance1 = result;
myBestTrainIdx1 = trainIdx;
}
else if (result < myBestDistance2)
{
myBestDistance2 = result;
myBestTrainIdx2 = trainIdx;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
__local float *s_distance = (__local float *)sharebuffer;
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
//findBestMatch
s_distance += lidy * block_size;
s_trainIdx += lidy * block_size;
s_distance[lidx] = myBestDistance1;
s_trainIdx[lidx] = myBestTrainIdx1;
float bestDistance1 = MAX_FLOAT;
float bestDistance2 = MAX_FLOAT;
int bestTrainIdx1 = -1;
int bestTrainIdx2 = -1;
barrier(CLK_LOCAL_MEM_FENCE);
if (lidx == 0)
{
for (int i = 0 ; i < block_size ; i++)
{
float val = s_distance[i];
if (val < bestDistance1)
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestDistance1 = val;
bestTrainIdx1 = s_trainIdx[i];
}
else if (val < bestDistance2)
{
bestDistance2 = val;
bestTrainIdx2 = s_trainIdx[i];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
s_distance[lidx] = myBestDistance2;
s_trainIdx[lidx] = myBestTrainIdx2;
barrier(CLK_LOCAL_MEM_FENCE);
if (lidx == 0)
{
for (int i = 0 ; i < block_size ; i++)
{
float val = s_distance[i];
if (val < bestDistance2)
{
bestDistance2 = val;
bestTrainIdx2 = s_trainIdx[i];
}
}
}
myBestDistance1 = bestDistance1;
myBestDistance2 = bestDistance2;
myBestTrainIdx1 = bestTrainIdx1;
myBestTrainIdx2 = bestTrainIdx2;
if (queryIdx < query_rows && lidx == 0)
{
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
}
}
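/* Hedged launch sketch (inferred from the indexing in BruteForceMatch_knnMatch above; the actual
   OpenCV host-side code is not shown in this file): queryIdx = get_group_id(0) * block_size + lidy
   and both local ids range over [0, block_size), so a plausible configuration is

       size_t localThreads[3]  = { block_size, block_size, 1 };
       size_t globalThreads[3] = { ((query_rows + block_size - 1) / block_size) * block_size,
                                   block_size, 1 };

   with sharebuffer sized for at least 2 * block_size * block_size floats (s_query + s_train,
   later reused as s_distance + s_trainIdx). */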
kernel void BruteForceMatch_calcDistanceUnrolled(
__global float *query,
__global float *train,
//__global float *mask,
__global float *allDist,
__local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType)
{
/* Todo */
}
kernel void BruteForceMatch_calcDistance(
__global float *query,
__global float *train,
//__global float *mask,
__global float *allDist,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType)
{
/* Todo */
}
kernel void BruteForceMatch_findBestMatch(
__global float *allDist,
__global int *bestTrainIdx,
__global float *bestDistance,
int k,
int block_size
)
{
/* Todo */
}
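/* Hedged sketch (not the authors' implementation -- the three kernels above are left as TODO in
   this file): a minimal BruteForceMatch_calcDistance body could let each work-item compute one
   (queryIdx, trainIdx) distance straight from global memory, assuming allDist is laid out as
   query_rows rows of train_rows floats:

       const int trainIdx = get_global_id(0);
       const int queryIdx = get_global_id(1);
       if (queryIdx < query_rows && trainIdx < train_rows)
       {
           float result = 0;
           for (int j = 0 ; j < query_cols ; ++j)
           {
               float d = query[queryIdx * (step / sizeof(float)) + j]
                       - train[trainIdx * (step / sizeof(float)) + j];
               result += (distType == 0) ? fabs(d) : d * d;   // L1 / L2; Hamming would use bit1Count as above
           }
           allDist[queryIdx * train_rows + trainIdx] = result;
       }
*/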

View File

@ -190,7 +190,7 @@ void cv::ocl::oclMat::upload(const Mat &m)
int pitch = wholeSize.width * 3 * m.elemSize1(); int pitch = wholeSize.width * 3 * m.elemSize1();
int tail_padding = m.elemSize1() * 3072; int tail_padding = m.elemSize1() * 3072;
int err; int err;
cl_mem temp = clCreateBuffer(clCxt->impl->clContext, CL_MEM_READ_WRITE, cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE,
(pitch * wholeSize.height + tail_padding - 1) / tail_padding * tail_padding, 0, &err); (pitch * wholeSize.height + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
openCLVerifyCall(err); openCLVerifyCall(err);
@ -242,7 +242,7 @@ void cv::ocl::oclMat::download(cv::Mat &m) const
int pitch = wholecols * 3 * m.elemSize1(); int pitch = wholecols * 3 * m.elemSize1();
int tail_padding = m.elemSize1() * 3072; int tail_padding = m.elemSize1() * 3072;
int err; int err;
cl_mem temp = clCreateBuffer(clCxt->impl->clContext, CL_MEM_READ_WRITE, cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE,
(pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err); (pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
openCLVerifyCall(err); openCLVerifyCall(err);
@ -595,7 +595,7 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, stri
#ifdef CL_VERSION_1_2 #ifdef CL_VERSION_1_2
if(dst.offset == 0 && dst.cols == dst.wholecols) if(dst.offset == 0 && dst.cols == dst.wholecols)
{ {
clEnqueueFillBuffer(dst.clCxt->impl->clCmdQueue, (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL); clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(), (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL);
} }
else else
{ {

View File

@ -43,17 +43,14 @@
// //
//M*/ //M*/
#include "mcwutil.hpp" #include "precomp.hpp"
#if defined (HAVE_OPENCL)
#ifndef CL_VERSION_1_2 #ifndef CL_VERSION_1_2
#define CL_VERSION_1_2 0 #define CL_VERSION_1_2 0
#endif #endif
using namespace std; using namespace std;
namespace cv namespace cv
{ {
namespace ocl namespace ocl
@ -94,15 +91,15 @@ namespace cv
for(size_t i = 0; i < args.size(); i ++) for(size_t i = 0; i < args.size(); i ++)
openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second)); openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads, openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 3, NULL, globalThreads,
localThreads, 0, NULL, NULL)); localThreads, 0, NULL, NULL));
switch(finish_mode) switch(finish_mode)
{ {
case CLFINISH: case CLFINISH:
clFinish(clCxt->impl->clCmdQueue); clFinish((cl_command_queue)clCxt->oclCommandQueue());
case CLFLUSH: case CLFLUSH:
clFlush(clCxt->impl->clCmdQueue); clFlush((cl_command_queue)clCxt->oclCommandQueue());
break; break;
case DISABLE: case DISABLE:
default: default:
@ -126,7 +123,7 @@ namespace cv
openCLExecuteKernel_2(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, openCLExecuteKernel_2(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
build_options, finish_mode); build_options, finish_mode);
} }
cl_mem bindTexture(const oclMat &mat) cl_mem bindTexture(const oclMat &mat)
{ {
cl_mem texture; cl_mem texture;
@ -177,10 +174,10 @@ namespace cv
desc.buffer = NULL; desc.buffer = NULL;
desc.num_mip_levels = 0; desc.num_mip_levels = 0;
desc.num_samples = 0; desc.num_samples = 0;
texture = clCreateImage(mat.clCxt->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err); texture = clCreateImage((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
#else #else
texture = clCreateImage2D( texture = clCreateImage2D(
mat.clCxt->impl->clContext, (cl_context)mat.clCxt->oclContext(),
CL_MEM_READ_WRITE, CL_MEM_READ_WRITE,
&format, &format,
mat.cols, mat.cols,
@ -195,10 +192,10 @@ namespace cv
cl_mem devData; cl_mem devData;
if (mat.cols * mat.elemSize() != mat.step) if (mat.cols * mat.elemSize() != mat.step)
{ {
devData = clCreateBuffer(mat.clCxt->impl->clContext, CL_MEM_READ_ONLY, mat.cols * mat.rows devData = clCreateBuffer((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_ONLY, mat.cols * mat.rows
* mat.elemSize(), NULL, NULL); * mat.elemSize(), NULL, NULL);
const size_t regin[3] = {mat.cols * mat.elemSize(), mat.rows, 1}; const size_t regin[3] = {mat.cols * mat.elemSize(), mat.rows, 1};
clEnqueueCopyBufferRect(mat.clCxt->impl->clCmdQueue, (cl_mem)mat.data, devData, origin, origin, clEnqueueCopyBufferRect((cl_command_queue)mat.clCxt->oclCommandQueue(), (cl_mem)mat.data, devData, origin, origin,
regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL); regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL);
} }
else else
@ -206,10 +203,10 @@ namespace cv
devData = (cl_mem)mat.data; devData = (cl_mem)mat.data;
} }
clEnqueueCopyBufferToImage(mat.clCxt->impl->clCmdQueue, devData, texture, 0, origin, region, 0, NULL, 0); clEnqueueCopyBufferToImage((cl_command_queue)mat.clCxt->oclCommandQueue(), devData, texture, 0, origin, region, 0, NULL, 0);
if ((mat.cols * mat.elemSize() != mat.step)) if ((mat.cols * mat.elemSize() != mat.step))
{ {
clFinish(mat.clCxt->impl->clCmdQueue); clFinish((cl_command_queue)mat.clCxt->oclCommandQueue());
clReleaseMemObject(devData); clReleaseMemObject(devData);
} }
@ -234,7 +231,7 @@ namespace cv
try try
{ {
cv::ocl::openCLGetKernelFromSource(clCxt, &_kernel_string, "test_func"); cv::ocl::openCLGetKernelFromSource(clCxt, &_kernel_string, "test_func");
_support = true; //_support = true;
} }
catch (const cv::Exception& e) catch (const cv::Exception& e)
{ {
@ -254,4 +251,3 @@ namespace cv
}//namespace ocl }//namespace ocl
}//namespace cv }//namespace cv
#endif

View File

@ -1,81 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef _OPENCV_MCWUTIL_
#define _OPENCV_MCWUTIL_
#include "precomp.hpp"
using namespace std;
namespace cv
{
namespace ocl
{
enum FLUSH_MODE
{
CLFINISH = 0,
CLFLUSH,
DISABLE
};
void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE);
void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE);
// bind oclMat to OpenCL image textures
// note:
// 1. there is no memory management. User need to explicitly release the resource
// 2. for faster clamping, there is no buffer padding for the constructed texture
cl_mem bindTexture(const oclMat &mat);
void releaseTexture(cl_mem& texture);
// returns whether the current context supports image2d_t format or not
bool support_image2d(Context *clCxt = Context::getContext());
}//namespace ocl
}//namespace cv
#endif //_OPENCV_MCWUTIL_

View File

@ -106,7 +106,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2; bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;
if (!cv::ocl::Context::getContext()->impl->double_support && is_float) if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE) && is_float)
{ {
CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
} }
@ -146,7 +146,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
cv::Mat dst(dst_a); cv::Mat dst(dst_a);
a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0; a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0;
if (!cv::ocl::Context::getContext()->impl->double_support) if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE))
{ {
for (int i = 0; i < contour->total; ++i) for (int i = 0; i < contour->total; ++i)
{ {

View File

@ -61,29 +61,29 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data ,src2_data; uchar4 src1_data ,src2_data;
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0; src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0; src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0; src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0; src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0; src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0; src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0; src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0; src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama; // short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
@ -117,14 +117,14 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -177,14 +177,14 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -236,18 +236,18 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define bitOfInt (sizeof(int)== 4 ? 2: 3) #define bitOfInt (sizeof(int)== 4 ? 2: 3)
#define dst_align ((dst_offset >> bitOfInt) & 3) #define dst_align ((dst_offset >> bitOfInt) & 3)
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt)); int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt)); int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt)); int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
@ -256,7 +256,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix)); int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0) if(src1_index < 0)
{ {
int4 tmp; int4 tmp;
@ -299,16 +299,16 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
@ -361,16 +361,16 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 3) & 3) #define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3)); int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));

View File

@ -63,8 +63,8 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -75,14 +75,14 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
uchar4 src2_data = vload4(0, src2 + src2_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0) if(src1_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
@ -113,8 +113,8 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -126,14 +126,14 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
char4 src2_data = vload4(0, src2 + src2_index_fix); char4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0) if(src1_index < 0)
{ {
char4 tmp; char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
char4 tmp; char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
@ -164,8 +164,8 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -177,14 +177,14 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0) if(src1_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
@ -216,8 +216,8 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -229,14 +229,14 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0) if(src1_index < 0)
{ {
short4 tmp; short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
short4 tmp; short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
@ -320,4 +320,3 @@ __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src
} }
} }
#endif #endif

View File

@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_
} }
} }
#endif #endif

View File

@ -1055,4 +1055,3 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr
} }
} }
#endif #endif

View File

@ -62,7 +62,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -72,7 +72,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = ~ src1_data; uchar4 tmp_data = ~ src1_data;
/* if(src1_index < 0) /* if(src1_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
@ -102,7 +102,7 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -136,7 +136,7 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -171,7 +171,7 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -245,14 +245,13 @@ __kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_o
{ {
int src_index = mad24(y, src_step, (x << 3) + src_offset); int src_index = mad24(y, src_step, (x << 3) + src_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
char8 data; char8 data;
data = *((__global char8 *)((__global char *)src + src_index)); data = *((__global char8 *)((__global char *)src + src_index));
data = ~ data; data = ~ data;
*((__global char8 *)((__global char *)dst + dst_index)) = data; *((__global char8 *)((__global char *)dst + dst_index)) = data;
} }
} }
#endif #endif

View File

@ -63,8 +63,8 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -111,8 +111,8 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -148,8 +148,8 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -186,8 +186,8 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -276,4 +276,3 @@ __kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1
} }
} }
#endif #endif

View File

@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_s
} }
} }
#endif #endif

View File

@ -911,4 +911,3 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in
} }
} }
#endif #endif

View File

@ -1078,4 +1078,3 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src
} }
} }
#endif #endif

View File

@ -63,8 +63,8 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -76,14 +76,14 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
uchar4 src2_data = vload4(0, src2 + src2_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0) if(src1_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
@ -113,8 +113,8 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -126,14 +126,14 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
char4 src2_data = vload4(0, src2 + src2_index_fix); char4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0) if(src1_index < 0)
{ {
char4 tmp; char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
char4 tmp; char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
@ -164,8 +164,8 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -177,14 +177,14 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0) if(src1_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
@ -216,8 +216,8 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -231,14 +231,14 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
if(src1_index < 0) if(src1_index < 0)
{ {
short4 tmp; short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
short4 tmp; short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
@ -324,4 +324,3 @@ __kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src
} }
} }
#endif #endif

View File

@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_
} }
} }
#endif #endif

View File

@ -1055,4 +1055,3 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int sr
} }
} }
#endif #endif

View File

@ -63,31 +63,31 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0) if(src1_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@ -115,29 +115,29 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1)& 3) #define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
if(src1_index < 0) if(src1_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@ -166,32 +166,32 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@ -215,32 +215,32 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@ -266,22 +266,22 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@ -308,29 +308,29 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@ -359,31 +359,31 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
@ -410,31 +410,31 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
@ -463,29 +463,29 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
@ -512,31 +512,31 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
@ -561,29 +561,29 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
@ -610,29 +610,29 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
@ -661,30 +661,30 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
@ -715,30 +715,30 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
@ -770,30 +770,30 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
@ -821,30 +821,30 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
@ -870,30 +870,30 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
@ -921,28 +921,28 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 3)& 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -954,4 +954,3 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
}
}
#endif
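All of the compare kernels in this file share the same boundary handling: x is rounded down to a multiple of 4, dst_index is masked with 0xfffffffc so the uchar4 store stays aligned, and a vload4 with a clamped index plus the swizzle fix-ups reads the vector safely near the row start. A minimal host-side C sketch of just that index arithmetic (illustrative only; the sample values for y, x, dst_step and dst_offset are assumptions, not taken from the patch):

#include <stdio.h>

int main(void)
{
    /* assumed sample values for one work-item */
    int y = 2, x = 8;                /* x already multiplied by 4, as in the kernels */
    int dst_step = 1024, dst_offset = 3;

    /* same expressions as the uchar (D0) kernels above */
    int dst_align = dst_offset & 3;
    int dst_index = y * dst_step + ((dst_offset + x) & (int)0xfffffffc); /* mad24(y, dst_step, ...) */

    printf("dst_align=%d dst_index=%d\n", dst_align, dst_index); /* prints dst_align=3 dst_index=2056 */
    return 0;
}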

View File

@ -59,29 +59,29 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@ -111,29 +111,29 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@ -163,29 +163,29 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@ -211,30 +211,30 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@ -260,28 +260,28 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@ -307,29 +307,29 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@ -344,7 +344,7 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
}
#endif
/***********************************Compare LT*******************************/
__kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
@ -359,29 +359,29 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@ -411,30 +411,30 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@ -464,29 +464,29 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
@ -513,34 +513,34 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@ -565,29 +565,29 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@ -614,29 +614,29 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@ -665,29 +665,29 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
@ -718,29 +718,29 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
@ -771,29 +771,29 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
@ -820,29 +820,29 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data =convert_uchar4((src1_data <= src2_data));
@ -868,28 +868,28 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
@ -916,29 +916,29 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3)& 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
@ -952,5 +952,3 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
}
}
#endif

View File

@ -455,5 +455,3 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse
}
}
#endif

View File

@ -60,17 +60,17 @@ __kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_of
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
@ -125,16 +125,16 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
@ -148,8 +148,8 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
src1_data.s01234567 = src1_data.s45670123;
if(src1_index== -2)
src1_data.s01234567 = src1_data.s23456701;
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));

View File

@ -240,4 +240,3 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]);
}
}

View File

@ -194,4 +194,3 @@ __kernel void arithm_op_minMax_mask (int cols,int invalid_cols,int offset,int el
dst[gid + groupnum] = localmem_max[0];
}
}

View File

@ -203,4 +203,3 @@ __kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,in
dst[gid] = localmem_sum[0];
}
}

View File

@ -245,4 +245,3 @@ __kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,
dst[gid*3+2] = localmem_sum3[0];
}
}

View File

@ -15,7 +15,7 @@
// Third party copyrights are property of their respective owners.
//
// @Authors
// Liu Liujun, liujun@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@ -61,7 +61,7 @@ __kernel void BlendLinear_C1_D0(
int pos = mad24(idy,istep >> 2,idx);
int wpos = mad24(idy,wstep >> 2,idx);
float4 w1 = weight1[wpos], w2 = weight2[wpos];
dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
}
}
@ -86,7 +86,7 @@ __kernel void BlendLinear_C4_D0(
int wpos = mad24(idy,wstep, idx);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
}
}
@ -138,4 +138,3 @@ __kernel void BlendLinear_C4_D5(
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
}

View File

@ -0,0 +1,865 @@
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
#define MAX_FLOAT 1e7f
int bit1Count(float x)
{
int c = 0;
int ix = (int)x;
for (int i = 0 ; i < 32 ; i++)
{
c += ix & 0x1;
ix >>= 1;
}
return c;
}
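// bit1Count is a software popcount over the integer part of x, e.g. bit1Count(5.0f) == 2 (binary 101).
// The Hamming branch below (distType == 2) calls it with (uint)a ^ (uint)b, which is converted
// through float on the way in, so very large bit patterns can lose precision.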
/* 2-D launch.
Global size: dim0 = ((query_rows + block_size - 1) / block_size) * block_size, dim1 = block_size.
Local size: dim0 = block_size, dim1 = block_size. */
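// sharebuffer is expected to hold at least block_size * (max_desc_len + block_size) floats:
// the first block_size * max_desc_len entries cache the query rows, the rest cache one
// block_size x block_size tile of train; the same buffer is reused for the final per-row reduction.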
__kernel void BruteForceMatch_UnrollMatch(
__global float *query,
__global float *train,
//__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
__local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * max_desc_len;
int queryIdx = groupidx * block_size + lidy;
// load the query into local memory.
for (int i = 0 ; i < max_desc_len / block_size; i ++)
{
int loadx = lidx + i * block_size;
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
}
float myBestDistance = MAX_FLOAT;
int myBestTrainIdx = -1;
// loopUnrolledCached to find the best trainIdx and best distance.
volatile int imgIdx = 0;
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{
float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; i++)
{
//load a block_size * block_size block into local train.
const int loadx = lidx + i * block_size;
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize so every element needed by this reduce iteration has been written to local memory.
barrier(CLK_LOCAL_MEM_FENCE);
/* There are three distance types in the reducer: L1Dist sums fabs(v1 - v2), L2Dist sums
(v1 - v2) * (v1 - v2), and Hamming uses popc(v1 ^ v2), i.e. the number of bits set to 1. */
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
int trainIdx = t * block_size + lidx;
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
{
//bestImgIdx = imgIdx;
myBestDistance = result;
myBestTrainIdx = trainIdx;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
__local float *s_distance = (__local float *)(sharebuffer);
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
//find BestMatch
s_distance += lidy * block_size;
s_trainIdx += lidy * block_size;
s_distance[lidx] = myBestDistance;
s_trainIdx[lidx] = myBestTrainIdx;
barrier(CLK_LOCAL_MEM_FENCE);
//reduce -- every thread scans the whole block and keeps the minimum distance and its index.
for (int k = 0 ; k < block_size; k++)
{
if (myBestDistance > s_distance[k])
{
myBestDistance = s_distance[k];
myBestTrainIdx = s_trainIdx[k];
}
}
if (queryIdx < query_rows && lidx == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
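// BruteForceMatch_Match: same single best-match search as the unrolled kernel above, but it
// tiles the descriptor by (query_cols + block_size - 1) / block_size instead of assuming a
// fixed max_desc_len, and needs only 2 * block_size * block_size floats in sharebuffer.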
__kernel void BruteForceMatch_Match(
__global float *query,
__global float *train,
//__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
const int queryIdx = groupidx * block_size + lidy;
float myBestDistance = MAX_FLOAT;
int myBestTrainIdx = -1;
__local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size;
// loop
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{
//Dist dist;
float result = 0;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
{
const int loadx = lidx + i * block_size;
//load query and train into local memory
s_query[lidy * block_size + lidx] = 0;
s_train[lidx * block_size + lidy] = 0;
if (loadx < query_cols)
{
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
}
barrier(CLK_LOCAL_MEM_FENCE);
/* There are three distance types in the reducer: L1Dist sums fabs(v1 - v2), L2Dist sums
(v1 - v2) * (v1 - v2), and Hamming uses popc(v1 ^ v2), i.e. the number of bits set to 1. */
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
const int trainIdx = t * block_size + lidx;
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
{
//myBestImgidx = imgIdx;
myBestDistance = result;
myBestTrainIdx = trainIdx;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
__local float *s_distance = (__local float *)sharebuffer;
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
//findBestMatch
s_distance += lidy * block_size;
s_trainIdx += lidy * block_size;
s_distance[lidx] = myBestDistance;
s_trainIdx[lidx] = myBestTrainIdx;
barrier(CLK_LOCAL_MEM_FENCE);
//reduce -- every thread scans the whole block and keeps the minimum distance and its index.
for (int k = 0 ; k < block_size; k++)
{
if (myBestDistance > s_distance[k])
{
myBestDistance = s_distance[k];
myBestTrainIdx = s_trainIdx[k];
}
}
if (queryIdx < query_rows && lidx == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
//radius_unrollmatch
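// Radius matching: every (query, train) pair whose distance is below maxDistance is recorded.
// atom_inc(nMatches + queryIdx) reserves the next free column in row queryIdx of
// bestTrainIdx/bestDistance (row pitch ostep bytes); hits beyond bestTrainIdx_cols are dropped.
// This relies on cl_khr_global_int32_base_atomics, enabled at the top of the file.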
__kernel void BruteForceMatch_RadiusUnrollMatch(
__global float *query,
__global float *train,
float maxDistance,
//__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__global int *nMatches,
__local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int bestTrainIdx_cols,
int step,
int ostep,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
const int groupidy = get_group_id(1);
const int queryIdx = groupidy * block_size + lidy;
const int trainIdx = groupidx * block_size + lidx;
__local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size;
float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; ++i)
{
//load a block_size * block_size block into local train.
const int loadx = lidx + i * block_size;
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize so every element needed by this reduce iteration has been written to local memory.
barrier(CLK_LOCAL_MEM_FENCE);
/* There are three distance types in the reducer: L1Dist sums fabs(v1 - v2), L2Dist sums
(v1 - v2) * (v1 - v2), and Hamming uses popc(v1 ^ v2), i.e. the number of bits set to 1. */
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; ++j)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; ++j)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; ++j)
{
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
{
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
if (ind < bestTrainIdx_cols)
{
//bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
}
}
}
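// The plain radius kernel below differs from the unrolled one only in the descriptor loop:
// it iterates (query_cols + block_size - 1) / block_size times instead of max_desc_len / block_size.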
//radius_match
__kernel void BruteForceMatch_RadiusMatch(
__global float *query,
__global float *train,
float maxDistance,
//__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__global int *nMatches,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int bestTrainIdx_cols,
int step,
int ostep,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
const int groupidy = get_group_id(1);
const int queryIdx = groupidy * block_size + lidy;
const int trainIdx = groupidx * block_size + lidx;
__local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size;
float result = 0;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
{
//load a block_size * block_size block into local train.
const int loadx = lidx + i * block_size;
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize so every element needed by this reduce iteration has been written to local memory.
barrier(CLK_LOCAL_MEM_FENCE);
/* There are three distance types in the reducer: L1Dist sums fabs(v1 - v2), L2Dist sums
(v1 - v2) * (v1 - v2), and Hamming uses popc(v1 ^ v2), i.e. the number of bits set to 1. */
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; ++j)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; ++j)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; ++j)
{
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
{
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
if (ind < bestTrainIdx_cols)
{
//bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
}
}
}
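// k-nearest-neighbour matching with k = 2: the kernels below track the best and second-best
// distance per query row and write them out as one int2 / float2 pair per query.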
__kernel void BruteForceMatch_knnUnrollMatch(
__global float *query,
__global float *train,
//__global float *mask,
__global int2 *bestTrainIdx,
__global float2 *bestDistance,
__local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
const int queryIdx = groupidx * block_size + lidy;
local float *s_query = sharebuffer;
local float *s_train = sharebuffer + block_size * max_desc_len;
// load the query into local memory.
for (int i = 0 ; i < max_desc_len / block_size; i ++)
{
int loadx = lidx + i * block_size;
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
}
float myBestDistance1 = MAX_FLOAT;
float myBestDistance2 = MAX_FLOAT;
int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1;
//loopUnrolledCached
volatile int imgIdx = 0;
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{
float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; i++)
{
//load a block_size * block_size block into local train.
const int loadx = lidx + i * block_size;
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize so every element needed by this reduce iteration has been written to local memory.
barrier(CLK_LOCAL_MEM_FENCE);
/* There are three distance types in the reducer: L1Dist sums fabs(v1 - v2), L2Dist sums
(v1 - v2) * (v1 - v2), and Hamming uses popc(v1 ^ v2), i.e. the number of bits set to 1. */
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
const int trainIdx = t * block_size + lidx;
if (queryIdx < query_rows && trainIdx < train_rows)
{
if (result < myBestDistance1)
{
myBestDistance2 = myBestDistance1;
myBestTrainIdx2 = myBestTrainIdx1;
myBestDistance1 = result;
myBestTrainIdx1 = trainIdx;
}
else if (result < myBestDistance2)
{
myBestDistance2 = result;
myBestTrainIdx2 = trainIdx;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
local float *s_distance = (local float *)sharebuffer;
local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
// find BestMatch
s_distance += lidy * block_size;
s_trainIdx += lidy * block_size;
s_distance[lidx] = myBestDistance1;
s_trainIdx[lidx] = myBestTrainIdx1;
float bestDistance1 = MAX_FLOAT;
float bestDistance2 = MAX_FLOAT;
int bestTrainIdx1 = -1;
int bestTrainIdx2 = -1;
barrier(CLK_LOCAL_MEM_FENCE);
if (lidx == 0)
{
for (int i = 0 ; i < block_size ; i++)
{
float val = s_distance[i];
if (val < bestDistance1)
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestDistance1 = val;
bestTrainIdx1 = s_trainIdx[i];
}
else if (val < bestDistance2)
{
bestDistance2 = val;
bestTrainIdx2 = s_trainIdx[i];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
s_distance[lidx] = myBestDistance2;
s_trainIdx[lidx] = myBestTrainIdx2;
barrier(CLK_LOCAL_MEM_FENCE);
if (lidx == 0)
{
for (int i = 0 ; i < block_size ; i++)
{
float val = s_distance[i];
if (val < bestDistance2)
{
bestDistance2 = val;
bestTrainIdx2 = s_trainIdx[i];
}
}
}
myBestDistance1 = bestDistance1;
myBestDistance2 = bestDistance2;
myBestTrainIdx1 = bestTrainIdx1;
myBestTrainIdx2 = bestTrainIdx2;
if (queryIdx < query_rows && lidx == 0)
{
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
}
}
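// BruteForceMatch_knnMatch: general-length counterpart of the unrolled kNN kernel above;
// it tiles the descriptor by (query_cols + block_size - 1) / block_size and uses
// 2 * block_size * block_size floats of local memory.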
__kernel void BruteForceMatch_knnMatch(
__global float *query,
__global float *train,
//__global float *mask,
__global int2 *bestTrainIdx,
__global float2 *bestDistance,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
const int queryIdx = groupidx * block_size + lidy;
local float *s_query = sharebuffer;
local float *s_train = sharebuffer + block_size * block_size;
float myBestDistance1 = MAX_FLOAT;
float myBestDistance2 = MAX_FLOAT;
int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1;
//loop
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{
float result = 0.0f;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
{
const int loadx = lidx + i * block_size;
//load query and train into local memory
s_query[lidy * block_size + lidx] = 0;
s_train[lidx * block_size + lidy] = 0;
if (loadx < query_cols)
{
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
}
barrier(CLK_LOCAL_MEM_FENCE);
/* There are three distance types in the reducer: L1Dist sums fabs(v1 - v2), L2Dist sums
(v1 - v2) * (v1 - v2), and Hamming uses popc(v1 ^ v2), i.e. the number of bits set to 1. */
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
const int trainIdx = t * block_size + lidx;
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
{
if (result < myBestDistance1)
{
myBestDistance2 = myBestDistance1;
myBestTrainIdx2 = myBestTrainIdx1;
myBestDistance1 = result;
myBestTrainIdx1 = trainIdx;
}
else if (result < myBestDistance2)
{
myBestDistance2 = result;
myBestTrainIdx2 = trainIdx;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
__local float *s_distance = (__local float *)sharebuffer;
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
//findBestMatch
s_distance += lidy * block_size;
s_trainIdx += lidy * block_size;
s_distance[lidx] = myBestDistance1;
s_trainIdx[lidx] = myBestTrainIdx1;
float bestDistance1 = MAX_FLOAT;
float bestDistance2 = MAX_FLOAT;
int bestTrainIdx1 = -1;
int bestTrainIdx2 = -1;
barrier(CLK_LOCAL_MEM_FENCE);
if (lidx == 0)
{
for (int i = 0 ; i < block_size ; i++)
{
float val = s_distance[i];
if (val < bestDistance1)
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestDistance1 = val;
bestTrainIdx1 = s_trainIdx[i];
}
else if (val < bestDistance2)
{
bestDistance2 = val;
bestTrainIdx2 = s_trainIdx[i];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
s_distance[lidx] = myBestDistance2;
s_trainIdx[lidx] = myBestTrainIdx2;
barrier(CLK_LOCAL_MEM_FENCE);
if (lidx == 0)
{
for (int i = 0 ; i < block_size ; i++)
{
float val = s_distance[i];
if (val < bestDistance2)
{
bestDistance2 = val;
bestTrainIdx2 = s_trainIdx[i];
}
}
}
myBestDistance1 = bestDistance1;
myBestDistance2 = bestDistance2;
myBestTrainIdx1 = bestTrainIdx1;
myBestTrainIdx2 = bestTrainIdx2;
if (queryIdx < query_rows && lidx == 0)
{
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
}
}
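// The three kernels below (calcDistanceUnrolled, calcDistance, findBestMatch) take an allDist
// buffer for a separate distance-table path; their bodies are still empty (/* Todo */).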
kernel void BruteForceMatch_calcDistanceUnrolled(
__global float *query,
__global float *train,
//__global float *mask,
__global float *allDist,
__local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType)
{
/* Todo */
}
kernel void BruteForceMatch_calcDistance(
__global float *query,
__global float *train,
//__global float *mask,
__global float *allDist,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType)
{
/* Todo */
}
kernel void BruteForceMatch_findBestMatch(
__global float *allDist,
__global int *bestTrainIdx,
__global float *bestDistance,
int k,
int block_size
)
{
/* Todo */
}

View File

@ -234,4 +234,3 @@ __kernel
map_y[y * step_y + x] = ycoo;
}
}

View File

@ -466,5 +466,3 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
dst[start_addr] = sum;
}
}

View File

@ -283,4 +283,3 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH
newnode[counter].alpha[0] = t1.alpha[0];
newnode[counter].alpha[1] = t1.alpha[1];
}

View File

@ -107,5 +107,3 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
dst[gy*(dst_step >> 2)+gx] = res;
}
}

View File

@ -267,4 +267,3 @@ __kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist(
}
}
*/

Some files were not shown because too many files have changed in this diff.