Merge pull request #907 from SpecLad:master
This commit is contained in:
commit
b5c013682b
BIN
3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so
vendored
BIN
3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so
vendored
Binary file not shown.
BIN
3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so
vendored
BIN
3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so
vendored
Binary file not shown.
BIN
3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so
vendored
BIN
3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so
vendored
Binary file not shown.
BIN
3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so
vendored
BIN
3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so
vendored
Binary file not shown.
BIN
3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so
vendored
BIN
3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so
vendored
Binary file not shown.
BIN
3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so
vendored
BIN
3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so
vendored
Binary file not shown.
BIN
3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so
vendored
BIN
3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so
vendored
Binary file not shown.
BIN
3rdparty/lib/armeabi/libnative_camera_r2.2.0.so
vendored
BIN
3rdparty/lib/armeabi/libnative_camera_r2.2.0.so
vendored
Binary file not shown.
BIN
3rdparty/lib/armeabi/libnative_camera_r2.3.3.so
vendored
BIN
3rdparty/lib/armeabi/libnative_camera_r2.3.3.so
vendored
Binary file not shown.
BIN
3rdparty/lib/armeabi/libnative_camera_r3.0.1.so
vendored
BIN
3rdparty/lib/armeabi/libnative_camera_r3.0.1.so
vendored
Binary file not shown.
BIN
3rdparty/lib/armeabi/libnative_camera_r4.0.0.so
vendored
BIN
3rdparty/lib/armeabi/libnative_camera_r4.0.0.so
vendored
Binary file not shown.
BIN
3rdparty/lib/armeabi/libnative_camera_r4.0.3.so
vendored
BIN
3rdparty/lib/armeabi/libnative_camera_r4.0.3.so
vendored
Binary file not shown.
BIN
3rdparty/lib/armeabi/libnative_camera_r4.1.1.so
vendored
BIN
3rdparty/lib/armeabi/libnative_camera_r4.1.1.so
vendored
Binary file not shown.
BIN
3rdparty/lib/armeabi/libnative_camera_r4.2.0.so
vendored
BIN
3rdparty/lib/armeabi/libnative_camera_r4.2.0.so
vendored
Binary file not shown.
BIN
3rdparty/lib/mips/libnative_camera_r4.0.3.so
vendored
BIN
3rdparty/lib/mips/libnative_camera_r4.0.3.so
vendored
Binary file not shown.
BIN
3rdparty/lib/mips/libnative_camera_r4.1.1.so
vendored
BIN
3rdparty/lib/mips/libnative_camera_r4.1.1.so
vendored
Binary file not shown.
BIN
3rdparty/lib/mips/libnative_camera_r4.2.0.so
vendored
BIN
3rdparty/lib/mips/libnative_camera_r4.2.0.so
vendored
Binary file not shown.
BIN
3rdparty/lib/x86/libnative_camera_r2.3.3.so
vendored
BIN
3rdparty/lib/x86/libnative_camera_r2.3.3.so
vendored
Binary file not shown.
BIN
3rdparty/lib/x86/libnative_camera_r3.0.1.so
vendored
BIN
3rdparty/lib/x86/libnative_camera_r3.0.1.so
vendored
Binary file not shown.
BIN
3rdparty/lib/x86/libnative_camera_r4.0.3.so
vendored
BIN
3rdparty/lib/x86/libnative_camera_r4.0.3.so
vendored
Binary file not shown.
BIN
3rdparty/lib/x86/libnative_camera_r4.1.1.so
vendored
BIN
3rdparty/lib/x86/libnative_camera_r4.1.1.so
vendored
Binary file not shown.
BIN
3rdparty/lib/x86/libnative_camera_r4.2.0.so
vendored
BIN
3rdparty/lib/x86/libnative_camera_r4.2.0.so
vendored
Binary file not shown.
5
3rdparty/libjasper/CMakeLists.txt
vendored
5
3rdparty/libjasper/CMakeLists.txt
vendored
@ -23,8 +23,8 @@ if(WIN32 AND NOT MINGW)
|
||||
add_definitions(-DJAS_WIN_MSVC_BUILD)
|
||||
endif(WIN32 AND NOT MINGW)
|
||||
|
||||
ocv_warnings_disable(CMAKE_C_FLAGS -Wno-implicit-function-declaration -Wno-uninitialized
|
||||
-Wmissing-prototypes -Wmissing-declarations -Wunused -Wshadow
|
||||
ocv_warnings_disable(CMAKE_C_FLAGS -Wno-implicit-function-declaration -Wno-uninitialized -Wmissing-prototypes
|
||||
-Wno-unused-but-set-parameter -Wmissing-declarations -Wunused -Wshadow
|
||||
-Wsign-compare -Wstrict-overflow)
|
||||
ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter) # clang
|
||||
ocv_warnings_disable(CMAKE_C_FLAGS /wd4013 /wd4018 /wd4101 /wd4244 /wd4267 /wd4715) # vs2005
|
||||
@ -49,4 +49,3 @@ endif()
|
||||
if(NOT BUILD_SHARED_LIBS)
|
||||
install(TARGETS ${JASPER_LIBRARY} ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main)
|
||||
endif()
|
||||
|
||||
|
@ -595,12 +595,15 @@ endif()
|
||||
status("")
|
||||
status(" GUI: ")
|
||||
|
||||
if(HAVE_QT)
|
||||
if(HAVE_QT5)
|
||||
status(" QT 5.x:" HAVE_QT THEN "YES (ver ${Qt5Core_VERSION_STRING})" ELSE NO)
|
||||
status(" QT OpenGL support:" HAVE_QT_OPENGL THEN "YES (${Qt5OpenGL_LIBRARIES} ${Qt5OpenGL_VERSION_STRING})" ELSE NO)
|
||||
elseif(HAVE_QT)
|
||||
status(" QT 4.x:" HAVE_QT THEN "YES (ver ${QT_VERSION_MAJOR}.${QT_VERSION_MINOR}.${QT_VERSION_PATCH} ${QT_EDITION})" ELSE NO)
|
||||
status(" QT OpenGL support:" HAVE_QT_OPENGL THEN "YES (${QT_QTOPENGL_LIBRARY})" ELSE NO)
|
||||
else()
|
||||
if(DEFINED WITH_QT)
|
||||
status(" QT 4.x:" NO)
|
||||
status(" QT:" NO)
|
||||
endif()
|
||||
if(DEFINED WITH_WIN32UI)
|
||||
status(" Win32 UI:" HAVE_WIN32UI THEN YES ELSE NO)
|
||||
|
@ -176,7 +176,8 @@ macro(android_get_compatible_target VAR)
|
||||
endmacro()
|
||||
|
||||
unset(__android_project_chain CACHE)
|
||||
#add_android_project(target_name ${path} NATIVE_DEPS opencv_core LIBRARY_DEPS ${OpenCV_BINARY_DIR} SDK_TARGET 11)
|
||||
|
||||
# add_android_project(target_name ${path} NATIVE_DEPS opencv_core LIBRARY_DEPS ${OpenCV_BINARY_DIR} SDK_TARGET 11)
|
||||
macro(add_android_project target path)
|
||||
# parse arguments
|
||||
set(android_proj_arglist NATIVE_DEPS LIBRARY_DEPS SDK_TARGET IGNORE_JAVA IGNORE_MANIFEST)
|
||||
@ -212,6 +213,16 @@ macro(add_android_project target path)
|
||||
ocv_check_dependencies(${android_proj_NATIVE_DEPS} opencv_java)
|
||||
endif()
|
||||
|
||||
if(EXISTS "${path}/jni/Android.mk" )
|
||||
# find if native_app_glue is used
|
||||
file(STRINGS "${path}/jni/Android.mk" NATIVE_APP_GLUE REGEX ".*(call import-module,android/native_app_glue)" )
|
||||
if(NATIVE_APP_GLUE)
|
||||
if(ANDROID_NATIVE_API_LEVEL LESS 9 OR NOT EXISTS "${ANDROID_NDK}/sources/android/native_app_glue")
|
||||
set(OCV_DEPENDENCIES_FOUND FALSE)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(OCV_DEPENDENCIES_FOUND AND android_proj_sdk_target AND ANDROID_EXECUTABLE AND ANT_EXECUTABLE AND ANDROID_TOOLS_Pkg_Revision GREATER 13 AND EXISTS "${path}/${ANDROID_MANIFEST_FILE}")
|
||||
|
||||
project(${target})
|
||||
@ -268,9 +279,6 @@ macro(add_android_project target path)
|
||||
file(STRINGS "${path}/jni/Android.mk" JNI_LIB_NAME REGEX "LOCAL_MODULE[ ]*:=[ ]*.*" )
|
||||
string(REGEX REPLACE "LOCAL_MODULE[ ]*:=[ ]*([a-zA-Z_][a-zA-Z_0-9]*)[ ]*" "\\1" JNI_LIB_NAME "${JNI_LIB_NAME}")
|
||||
|
||||
# find using of native app glue to determine native activity
|
||||
file(STRINGS "${path}/jni/Android.mk" NATIVE_APP_GLUE REGEX ".*(call import-module,android/native_app_glue)" )
|
||||
|
||||
if(JNI_LIB_NAME)
|
||||
ocv_include_modules_recurse(${android_proj_NATIVE_DEPS})
|
||||
ocv_include_directories("${path}/jni")
|
||||
@ -291,9 +299,9 @@ macro(add_android_project target path)
|
||||
)
|
||||
|
||||
get_target_property(android_proj_jni_location "${JNI_LIB_NAME}" LOCATION)
|
||||
if (NOT (CMAKE_BUILD_TYPE MATCHES "debug"))
|
||||
add_custom_command(TARGET ${JNI_LIB_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} --strip-unneeded "${android_proj_jni_location}")
|
||||
endif()
|
||||
if (NOT (CMAKE_BUILD_TYPE MATCHES "debug"))
|
||||
add_custom_command(TARGET ${JNI_LIB_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} --strip-unneeded "${android_proj_jni_location}")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
@ -101,7 +101,7 @@ endif()
|
||||
|
||||
if(MSVC64 OR MINGW64)
|
||||
set(X86_64 1)
|
||||
elseif(MSVC AND NOT CMAKE_CROSSCOMPILING)
|
||||
elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
|
||||
set(X86 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||
set(X86_64 1)
|
||||
|
@ -20,12 +20,6 @@ else(APPLE)
|
||||
DOC "OpenCL include directory"
|
||||
NO_DEFAULT_PATH)
|
||||
|
||||
find_path(OPENCL_INCLUDE_DIR
|
||||
NAMES OpenCL/cl.h CL/cl.h
|
||||
HINTS ${OPENCL_ROOT_DIR}
|
||||
PATH_SUFFIXES include include/nvidia-current
|
||||
DOC "OpenCL include directory")
|
||||
|
||||
if (X86_64)
|
||||
set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win64 lib/x86_64 lib/x64)
|
||||
elseif (X86)
|
||||
@ -39,12 +33,6 @@ else(APPLE)
|
||||
DOC "OpenCL library"
|
||||
NO_DEFAULT_PATH)
|
||||
|
||||
find_library(OPENCL_LIBRARY
|
||||
NAMES OpenCL
|
||||
HINTS ${OPENCL_ROOT_DIR}
|
||||
PATH_SUFFIXES ${OPENCL_POSSIBLE_LIB_SUFFIXES}
|
||||
DOC "OpenCL library")
|
||||
|
||||
mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY)
|
||||
include(FindPackageHandleStandardArgs)
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPENCL DEFAULT_MSG OPENCL_LIBRARY OPENCL_INCLUDE_DIR )
|
||||
|
@ -102,18 +102,12 @@ if(PYTHON_EXECUTABLE)
|
||||
if(BUILD_DOCS)
|
||||
find_host_program(SPHINX_BUILD sphinx-build)
|
||||
if(SPHINX_BUILD)
|
||||
if(UNIX)
|
||||
execute_process(COMMAND sh -c "${SPHINX_BUILD} -_ 2>&1 | sed -ne 1p"
|
||||
RESULT_VARIABLE SPHINX_PROCESS
|
||||
OUTPUT_VARIABLE SPHINX_VERSION
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
else()
|
||||
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import sphinx; print sphinx.__version__"
|
||||
RESULT_VARIABLE SPHINX_PROCESS
|
||||
OUTPUT_VARIABLE SPHINX_VERSION
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
endif()
|
||||
if(SPHINX_PROCESS EQUAL 0)
|
||||
execute_process(COMMAND "${SPHINX_BUILD}"
|
||||
OUTPUT_QUIET
|
||||
ERROR_VARIABLE SPHINX_OUTPUT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
if(SPHINX_OUTPUT MATCHES "^Sphinx v([0-9][^ \n]*)")
|
||||
set(SPHINX_VERSION "${CMAKE_MATCH_1}")
|
||||
set(HAVE_SPHINX 1)
|
||||
message(STATUS "Found Sphinx ${SPHINX_VERSION}: ${SPHINX_BUILD}")
|
||||
endif()
|
||||
|
@ -13,12 +13,31 @@ if(WITH_WIN32UI)
|
||||
endif(WITH_WIN32UI)
|
||||
|
||||
# --- QT4 ---
|
||||
ocv_clear_vars(HAVE_QT)
|
||||
ocv_clear_vars(HAVE_QT HAVE_QT5)
|
||||
if(WITH_QT)
|
||||
find_package(Qt4)
|
||||
if(QT4_FOUND)
|
||||
set(HAVE_QT TRUE)
|
||||
add_definitions(-DHAVE_QT) # We need to define the macro this way, using cvconfig.h does not work
|
||||
if(NOT CMAKE_VERSION VERSION_LESS 2.8.3 AND NOT WITH_QT EQUAL 4)
|
||||
find_package(Qt5Core)
|
||||
find_package(Qt5Gui)
|
||||
find_package(Qt5Widgets)
|
||||
find_package(Qt5Test)
|
||||
find_package(Qt5Concurrent)
|
||||
if(Qt5Core_FOUND AND Qt5Gui_FOUND AND Qt5Widgets_FOUND AND Qt5Test_FOUND AND Qt5Concurrent_FOUND)
|
||||
set(HAVE_QT5 ON)
|
||||
set(HAVE_QT ON)
|
||||
add_definitions(-DHAVE_QT)
|
||||
find_package(Qt5OpenGL)
|
||||
if(Qt5OpenGL_FOUND)
|
||||
set(QT_QTOPENGL_FOUND ON)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(NOT HAVE_QT)
|
||||
find_package(Qt4)
|
||||
if(QT4_FOUND)
|
||||
set(HAVE_QT TRUE)
|
||||
add_definitions(-DHAVE_QT) # We need to define the macro this way, using cvconfig.h does not work
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
@ -362,6 +362,9 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
|
||||
typedef sp<Camera> (*Android23ConnectFuncType)(int);
|
||||
typedef sp<Camera> (*Android3DConnectFuncType)(int, int);
|
||||
|
||||
const int BACK_CAMERA_INDEX = 99;
|
||||
const int FRONT_CAMERA_INDEX = 98;
|
||||
|
||||
enum {
|
||||
CAMERA_SUPPORT_MODE_2D = 0x01, /* Camera Sensor supports 2D mode. */
|
||||
CAMERA_SUPPORT_MODE_3D = 0x02, /* Camera Sensor supports 3D mode. */
|
||||
@ -373,7 +376,51 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
|
||||
const char Android23ConnectName[] = "_ZN7android6Camera7connectEi";
|
||||
const char Android3DConnectName[] = "_ZN7android6Camera7connectEii";
|
||||
|
||||
LOGD("CameraHandler::initCameraConnect(%p, %d, %p, %p)", callback, cameraId, userData, prevCameraParameters);
|
||||
int localCameraIndex = cameraId;
|
||||
|
||||
#if !defined(ANDROID_r2_2_0)
|
||||
if (cameraId == BACK_CAMERA_INDEX)
|
||||
{
|
||||
LOGD("Back camera selected");
|
||||
for (int i = 0; i < Camera::getNumberOfCameras(); i++)
|
||||
{
|
||||
CameraInfo info;
|
||||
Camera::getCameraInfo(i, &info);
|
||||
if (info.facing == CAMERA_FACING_BACK)
|
||||
{
|
||||
localCameraIndex = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (cameraId == FRONT_CAMERA_INDEX)
|
||||
{
|
||||
LOGD("Front camera selected");
|
||||
for (int i = 0; i < Camera::getNumberOfCameras(); i++)
|
||||
{
|
||||
CameraInfo info;
|
||||
Camera::getCameraInfo(i, &info);
|
||||
if (info.facing == CAMERA_FACING_FRONT)
|
||||
{
|
||||
localCameraIndex = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (localCameraIndex == BACK_CAMERA_INDEX)
|
||||
{
|
||||
LOGE("Back camera not found!");
|
||||
return NULL;
|
||||
}
|
||||
else if (localCameraIndex == FRONT_CAMERA_INDEX)
|
||||
{
|
||||
LOGE("Front camera not found!");
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
LOGD("CameraHandler::initCameraConnect(%p, %d, %p, %p)", callback, localCameraIndex, userData, prevCameraParameters);
|
||||
|
||||
sp<Camera> camera = 0;
|
||||
|
||||
@ -381,8 +428,8 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
|
||||
|
||||
if (!CameraHALHandle)
|
||||
{
|
||||
LOGE("Cannot link to \"libcamera_client.so\"");
|
||||
return NULL;
|
||||
LOGE("Cannot link to \"libcamera_client.so\"");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// reset errors
|
||||
@ -390,24 +437,24 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
|
||||
|
||||
if (Android22ConnectFuncType Android22Connect = (Android22ConnectFuncType)dlsym(CameraHALHandle, Android22ConnectName))
|
||||
{
|
||||
LOGD("Connecting to CameraService v 2.2");
|
||||
camera = Android22Connect();
|
||||
LOGD("Connecting to CameraService v 2.2");
|
||||
camera = Android22Connect();
|
||||
}
|
||||
else if (Android23ConnectFuncType Android23Connect = (Android23ConnectFuncType)dlsym(CameraHALHandle, Android23ConnectName))
|
||||
{
|
||||
LOGD("Connecting to CameraService v 2.3");
|
||||
camera = Android23Connect(cameraId);
|
||||
LOGD("Connecting to CameraService v 2.3");
|
||||
camera = Android23Connect(localCameraIndex);
|
||||
}
|
||||
else if (Android3DConnectFuncType Android3DConnect = (Android3DConnectFuncType)dlsym(CameraHALHandle, Android3DConnectName))
|
||||
{
|
||||
LOGD("Connecting to CameraService v 3D");
|
||||
camera = Android3DConnect(cameraId, CAMERA_SUPPORT_MODE_2D);
|
||||
LOGD("Connecting to CameraService v 3D");
|
||||
camera = Android3DConnect(localCameraIndex, CAMERA_SUPPORT_MODE_2D);
|
||||
}
|
||||
else
|
||||
{
|
||||
dlclose(CameraHALHandle);
|
||||
LOGE("Cannot connect to CameraService. Connect method was not found!");
|
||||
return NULL;
|
||||
dlclose(CameraHALHandle);
|
||||
LOGE("Cannot connect to CameraService. Connect method was not found!");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
dlclose(CameraHALHandle);
|
||||
@ -422,7 +469,7 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
|
||||
camera->setListener(handler);
|
||||
|
||||
handler->camera = camera;
|
||||
handler->cameraId = cameraId;
|
||||
handler->cameraId = localCameraIndex;
|
||||
|
||||
if (prevCameraParameters != 0)
|
||||
{
|
||||
|
@ -1486,6 +1486,6 @@ The function reconstructs 3-dimensional points (in homogeneous coordinates) by u
|
||||
|
||||
.. [SteweniusCFS] Stewénius, H., Calibrated Fivepoint solver. http://www.vis.uky.edu/~stewe/FIVEPOINT/
|
||||
|
||||
.. [Slabaugh] Slabaugh, G.G. Computing Euler angles from a rotation matrix. http://gregslabaugh.name/publications/euler.pdf
|
||||
.. [Slabaugh] Slabaugh, G.G. Computing Euler angles from a rotation matrix. http://www.soi.city.ac.uk/~sbbh653/publications/euler.pdf (verified: 2013-04-15)
|
||||
|
||||
.. [Zhang2000] Z. Zhang. A Flexible New Technique for Camera Calibration. IEEE Transactions on Pattern Analysis and Machine Intelligence, 22(11):1330-1334, 2000.
|
||||
|
@ -2850,8 +2850,9 @@ PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, int maxComp
|
||||
|
||||
if( _mean.data )
|
||||
{
|
||||
CV_Assert( _mean.size() == mean_sz );
|
||||
CV_Assert( _mean.size() == mean_sz );
|
||||
_mean.convertTo(mean, ctype);
|
||||
covar_flags |= CV_COVAR_USE_AVG;
|
||||
}
|
||||
|
||||
calcCovarMatrix( data, covar, mean, covar_flags, ctype );
|
||||
|
@ -42,7 +42,6 @@ template <typename Distance>
|
||||
void find_nearest(const Matrix<typename Distance::ElementType>& dataset, typename Distance::ElementType* query, int* matches, int nn,
|
||||
int skip = 0, Distance distance = Distance())
|
||||
{
|
||||
typedef typename Distance::ElementType ElementType;
|
||||
typedef typename Distance::ResultType DistanceType;
|
||||
int n = nn + skip;
|
||||
|
||||
|
@ -76,7 +76,26 @@ set(highgui_srcs
|
||||
|
||||
file(GLOB highgui_ext_hdrs "include/opencv2/*.hpp" "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
|
||||
|
||||
if(HAVE_QT)
|
||||
if(HAVE_QT5)
|
||||
set(CMAKE_AUTOMOC ON)
|
||||
set(CMAKE_INCLUDE_CURRENT_DIR ON)
|
||||
|
||||
QT5_ADD_RESOURCES(_RCC_OUTFILES src/window_QT.qrc)
|
||||
list(APPEND highgui_srcs src/window_QT.cpp src/window_QT.h ${_RCC_OUTFILES})
|
||||
|
||||
foreach(dt5_dep Core Gui Widgets Test Concurrent)
|
||||
add_definitions(${Qt5${dt5_dep}_DEFINITIONS})
|
||||
include_directories(${Qt5${dt5_dep}_INCLUDE_DIRS})
|
||||
list(APPEND HIGHGUI_LIBRARIES ${Qt5${dt5_dep}_LIBRARIES})
|
||||
endforeach()
|
||||
|
||||
if(HAVE_QT_OPENGL)
|
||||
add_definitions(${Qt5OpenGL_DEFINITIONS})
|
||||
include_directories(${Qt5OpenGL_INCLUDE_DIRS})
|
||||
list(APPEND HIGHGUI_LIBRARIES ${Qt5OpenGL_LIBRARIES})
|
||||
endif()
|
||||
|
||||
elseif(HAVE_QT)
|
||||
if (HAVE_QT_OPENGL)
|
||||
set(QT_USE_QTOPENGL TRUE)
|
||||
endif()
|
||||
|
@ -306,6 +306,8 @@ enum
|
||||
CV_CAP_OPENNI_ASUS =910, // OpenNI (for Asus Xtion)
|
||||
|
||||
CV_CAP_ANDROID =1000, // Android
|
||||
CV_CAP_ANDROID_BACK =CV_CAP_ANDROID+99, // Android back camera
|
||||
CV_CAP_ANDROID_FRONT =CV_CAP_ANDROID+98, // Android front camera
|
||||
|
||||
CV_CAP_XIAPI =1100, // XIMEA Camera API
|
||||
|
||||
|
@ -1665,6 +1665,17 @@ static int icvSetPropertyCAM_V4L(CvCaptureCAM_V4L* capture, int property_id, dou
|
||||
width = height = 0;
|
||||
}
|
||||
break;
|
||||
case CV_CAP_PROP_FPS:
|
||||
struct v4l2_streamparm setfps;
|
||||
memset (&setfps, 0, sizeof(struct v4l2_streamparm));
|
||||
setfps.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
|
||||
setfps.parm.capture.timeperframe.numerator = 1;
|
||||
setfps.parm.capture.timeperframe.denominator = value;
|
||||
if (xioctl (capture->deviceHandle, VIDIOC_S_PARM, &setfps) < 0){
|
||||
fprintf(stderr, "HIGHGUI ERROR: V4L: Unable to set camera FPS\n");
|
||||
retval=0;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
retval = icvSetControl(capture, property_id, value);
|
||||
}
|
||||
|
@ -52,6 +52,11 @@
|
||||
#include <stdio.h>
|
||||
#include <setjmp.h>
|
||||
|
||||
// the following defines are a hack to avoid multiple problems with frame ponter handling and setjmp
|
||||
// see http://gcc.gnu.org/ml/gcc/2011-10/msg00324.html for some details
|
||||
#define mingw_getsp(...) 0
|
||||
#define __builtin_frame_address(...) 0
|
||||
|
||||
#ifdef WIN32
|
||||
|
||||
#define XMD_H // prevent redefinition of INT32
|
||||
|
@ -73,6 +73,11 @@
|
||||
#pragma warning( disable: 4611 )
|
||||
#endif
|
||||
|
||||
// the following defines are a hack to avoid multiple problems with frame ponter handling and setjmp
|
||||
// see http://gcc.gnu.org/ml/gcc/2011-10/msg00324.html for some details
|
||||
#define mingw_getsp(...) 0
|
||||
#define __builtin_frame_address(...) 0
|
||||
|
||||
namespace cv
|
||||
{
|
||||
|
||||
|
@ -48,13 +48,13 @@
|
||||
#endif
|
||||
|
||||
#include <QAbstractEventDispatcher>
|
||||
#include <QtGui/QApplication>
|
||||
#include <QApplication>
|
||||
#include <QFile>
|
||||
#include <QPushButton>
|
||||
#include <QtGui/QGraphicsView>
|
||||
#include <QGraphicsView>
|
||||
#include <QSizePolicy>
|
||||
#include <QInputDialog>
|
||||
#include <QtGui/QBoxLayout>
|
||||
#include <QBoxLayout>
|
||||
#include <QSettings>
|
||||
#include <qtimer.h>
|
||||
#include <QtConcurrentRun>
|
||||
@ -78,7 +78,7 @@
|
||||
#include <QRadioButton>
|
||||
#include <QButtonGroup>
|
||||
#include <QMenu>
|
||||
#include <QtTest/QTest>
|
||||
#include <QTest>
|
||||
|
||||
//start private enum
|
||||
enum { CV_MODE_NORMAL = 0, CV_MODE_OPENGL = 1 };
|
||||
|
@ -342,7 +342,7 @@ Finds the convex hull of a point set.
|
||||
|
||||
:param hull_storage: Output memory storage in the old API (``cvConvexHull2`` returns a sequence containing the convex hull points or their indices).
|
||||
|
||||
:param clockwise: Orientation flag. If it is true, the output convex hull is oriented clockwise. Otherwise, it is oriented counter-clockwise. The usual screen coordinate system is assumed so that the origin is at the top-left corner, x axis is oriented to the right, and y axis is oriented downwards.
|
||||
:param clockwise: Orientation flag. If it is true, the output convex hull is oriented clockwise. Otherwise, it is oriented counter-clockwise. The assumed coordinate system has its X axis pointing to the right, and its Y axis pointing upwards.
|
||||
|
||||
:param orientation: Convex hull orientation parameter in the old API, ``CV_CLOCKWISE`` or ``CV_COUNTERCLOCKWISE``.
|
||||
|
||||
|
@ -127,7 +127,6 @@ floodFill_CnIR( Mat& image, Point seed,
|
||||
_Tp newVal, ConnectedComp* region, int flags,
|
||||
std::vector<FFillSegment>* buffer )
|
||||
{
|
||||
typedef typename DataType<_Tp>::channel_type _CTp;
|
||||
_Tp* img = (_Tp*)(image.data + image.step * seed.y);
|
||||
Size roi = image.size();
|
||||
int i, L, R;
|
||||
@ -279,7 +278,6 @@ floodFillGrad_CnIR( Mat& image, Mat& msk,
|
||||
Diff diff, ConnectedComp* region, int flags,
|
||||
std::vector<FFillSegment>* buffer )
|
||||
{
|
||||
typedef typename DataType<_Tp>::channel_type _CTp;
|
||||
int step = (int)image.step, maskStep = (int)msk.step;
|
||||
uchar* pImage = image.data;
|
||||
_Tp* img = (_Tp*)(pImage + step*seed.y);
|
||||
@ -610,7 +608,7 @@ int cv::floodFill( InputOutputArray _image, InputOutputArray _mask,
|
||||
&comp, flags, &buffer);
|
||||
else
|
||||
CV_Error(CV_StsUnsupportedFormat, "");
|
||||
|
||||
|
||||
if( rect )
|
||||
*rect = comp.rect;
|
||||
return comp.area;
|
||||
|
@ -1219,8 +1219,6 @@ static void resizeGeneric_( const Mat& src, Mat& dst,
|
||||
const int* yofs, const void* _beta,
|
||||
int xmin, int xmax, int ksize )
|
||||
{
|
||||
typedef typename HResize::value_type T;
|
||||
typedef typename HResize::buf_type WT;
|
||||
typedef typename HResize::alpha_type AT;
|
||||
|
||||
const AT* beta = (const AT*)_beta;
|
||||
|
@ -1,5 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<lint>
|
||||
<issue id="InlinedApi">
|
||||
<ignore path="src\org\opencv\android\JavaCameraView.java" />
|
||||
</issue>
|
||||
<issue id="NewApi">
|
||||
<ignore path="src\org\opencv\android\JavaCameraView.java" />
|
||||
</issue>
|
||||
|
@ -4,8 +4,8 @@
|
||||
<attr name="show_fps" format="boolean"/>
|
||||
<attr name="camera_id" format="integer" >
|
||||
<enum name="any" value="-1" />
|
||||
<enum name="back" value="0" />
|
||||
<enum name="front" value="1" />
|
||||
<enum name="back" value="99" />
|
||||
<enum name="front" value="98" />
|
||||
</attr>
|
||||
</declare-styleable>
|
||||
</resources>
|
||||
|
@ -47,10 +47,14 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
|
||||
protected int mMaxWidth;
|
||||
protected float mScale = 0;
|
||||
protected int mPreviewFormat = Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA;
|
||||
protected int mCameraIndex = -1;
|
||||
protected int mCameraIndex = CAMERA_ID_ANY;
|
||||
protected boolean mEnabled;
|
||||
protected FpsMeter mFpsMeter = null;
|
||||
|
||||
public static final int CAMERA_ID_ANY = -1;
|
||||
public static final int CAMERA_ID_BACK = 99;
|
||||
public static final int CAMERA_ID_FRONT = 98;
|
||||
|
||||
public CameraBridgeViewBase(Context context, int cameraId) {
|
||||
super(context);
|
||||
mCameraIndex = cameraId;
|
||||
@ -74,6 +78,7 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
|
||||
getHolder().addCallback(this);
|
||||
mMaxWidth = MAX_UNSPECIFIED;
|
||||
mMaxHeight = MAX_UNSPECIFIED;
|
||||
styledAttrs.recycle();
|
||||
}
|
||||
|
||||
public interface CvCameraViewListener {
|
||||
@ -155,8 +160,6 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
|
||||
mPreviewFormat = format;
|
||||
}
|
||||
|
||||
private CvCameraViewListenerAdapter() {}
|
||||
|
||||
private int mPreviewFormat = Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA;
|
||||
private CvCameraViewListener mOldStyleListener;
|
||||
};
|
||||
|
@ -6,6 +6,7 @@ import android.content.Context;
|
||||
import android.graphics.ImageFormat;
|
||||
import android.graphics.SurfaceTexture;
|
||||
import android.hardware.Camera;
|
||||
import android.hardware.Camera.CameraInfo;
|
||||
import android.hardware.Camera.PreviewCallback;
|
||||
import android.os.Build;
|
||||
import android.util.AttributeSet;
|
||||
@ -68,7 +69,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
|
||||
synchronized (this) {
|
||||
mCamera = null;
|
||||
|
||||
if (mCameraIndex == -1) {
|
||||
if (mCameraIndex == CAMERA_ID_ANY) {
|
||||
Log.d(TAG, "Trying to open camera with old open()");
|
||||
try {
|
||||
mCamera = Camera.open();
|
||||
@ -92,11 +93,39 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
|
||||
}
|
||||
} else {
|
||||
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.GINGERBREAD) {
|
||||
Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(mCameraIndex) + ")");
|
||||
try {
|
||||
mCamera = Camera.open(mCameraIndex);
|
||||
} catch (RuntimeException e) {
|
||||
Log.e(TAG, "Camera #" + mCameraIndex + "failed to open: " + e.getLocalizedMessage());
|
||||
int localCameraIndex = mCameraIndex;
|
||||
if (mCameraIndex == CAMERA_ID_BACK) {
|
||||
Log.i(TAG, "Trying to open back camera");
|
||||
Camera.CameraInfo cameraInfo = new Camera.CameraInfo();
|
||||
for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) {
|
||||
Camera.getCameraInfo( camIdx, cameraInfo );
|
||||
if (cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_BACK) {
|
||||
localCameraIndex = camIdx;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if (mCameraIndex == CAMERA_ID_FRONT) {
|
||||
Log.i(TAG, "Trying to open front camera");
|
||||
Camera.CameraInfo cameraInfo = new Camera.CameraInfo();
|
||||
for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) {
|
||||
Camera.getCameraInfo( camIdx, cameraInfo );
|
||||
if (cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_FRONT) {
|
||||
localCameraIndex = camIdx;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (localCameraIndex == CAMERA_ID_BACK) {
|
||||
Log.e(TAG, "Back camera not found!");
|
||||
} else if (localCameraIndex == CAMERA_ID_FRONT) {
|
||||
Log.e(TAG, "Front camera not found!");
|
||||
} else {
|
||||
Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(localCameraIndex) + ")");
|
||||
try {
|
||||
mCamera = Camera.open(localCameraIndex);
|
||||
} catch (RuntimeException e) {
|
||||
Log.e(TAG, "Camera #" + localCameraIndex + "failed to open: " + e.getLocalizedMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -179,6 +208,8 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
|
||||
synchronized (this) {
|
||||
if (mCamera != null) {
|
||||
mCamera.stopPreview();
|
||||
mCamera.setPreviewCallback(null);
|
||||
|
||||
mCamera.release();
|
||||
}
|
||||
mCamera = null;
|
||||
@ -267,9 +298,6 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
|
||||
mRgba.release();
|
||||
}
|
||||
|
||||
private JavaCameraFrame(CvCameraViewFrame obj) {
|
||||
}
|
||||
|
||||
private Mat mYuvFrameData;
|
||||
private Mat mRgba;
|
||||
private int mWidth;
|
||||
|
@ -53,14 +53,16 @@ public class NativeCameraView extends CameraBridgeViewBase {
|
||||
/* 1. We need to stop thread which updating the frames
|
||||
* 2. Stop camera and release it
|
||||
*/
|
||||
try {
|
||||
mStopThread = true;
|
||||
mThread.join();
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
mThread = null;
|
||||
mStopThread = false;
|
||||
if (mThread != null) {
|
||||
try {
|
||||
mStopThread = true;
|
||||
mThread.join();
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
mThread = null;
|
||||
mStopThread = false;
|
||||
}
|
||||
}
|
||||
|
||||
/* Now release camera */
|
||||
@ -131,17 +133,17 @@ public class NativeCameraView extends CameraBridgeViewBase {
|
||||
}
|
||||
}
|
||||
|
||||
private class NativeCameraFrame implements CvCameraViewFrame {
|
||||
private static class NativeCameraFrame implements CvCameraViewFrame {
|
||||
|
||||
@Override
|
||||
public Mat rgba() {
|
||||
mCamera.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA);
|
||||
mCapture.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA);
|
||||
return mRgba;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Mat gray() {
|
||||
mCamera.retrieve(mGray, Highgui.CV_CAP_ANDROID_GREY_FRAME);
|
||||
mCapture.retrieve(mGray, Highgui.CV_CAP_ANDROID_GREY_FRAME);
|
||||
return mGray;
|
||||
}
|
||||
|
||||
@ -158,9 +160,6 @@ public class NativeCameraView extends CameraBridgeViewBase {
|
||||
|
||||
private class CameraWorker implements Runnable {
|
||||
|
||||
private Mat mRgba = new Mat();
|
||||
private Mat mGray = new Mat();
|
||||
|
||||
public void run() {
|
||||
do {
|
||||
if (!mCamera.grab()) {
|
||||
|
@ -61,7 +61,7 @@ protected:
|
||||
{
|
||||
int ObjNum = m_TrackList.GetBlobNum();
|
||||
int i;
|
||||
char video_name[1024];
|
||||
char video_name[1024+1];
|
||||
char* struct_name = NULL;
|
||||
CvFileStorage* storage = cvOpenFileStorage(m_pFileName,NULL,CV_STORAGE_WRITE_TEXT);
|
||||
|
||||
|
@ -117,10 +117,10 @@ class CvKDTreeWrap : public CvFeatureTree {
|
||||
CvMat* results) {
|
||||
int rn = results->rows * results->cols;
|
||||
std::vector<int> inbounds;
|
||||
dispatch_cvtype(mat, ((__treetype*)data)->
|
||||
find_ortho_range((typename __treetype::scalar_type*)bounds_min->data.ptr,
|
||||
assert(CV_MAT_DEPTH(mat->type) == CV_32F || CV_MAT_DEPTH(mat->type) == CV_64F);
|
||||
((__treetype*)data)->find_ortho_range((typename __treetype::scalar_type*)bounds_min->data.ptr,
|
||||
(typename __treetype::scalar_type*)bounds_max->data.ptr,
|
||||
inbounds));
|
||||
inbounds);
|
||||
std::copy(inbounds.begin(),
|
||||
inbounds.begin() + std::min((int)inbounds.size(), rn),
|
||||
(int*) results->data.ptr);
|
||||
|
@ -1140,7 +1140,7 @@ void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& o
|
||||
|
||||
Size windowSize( cvRound(originalWindowSize.width*factor), cvRound(originalWindowSize.height*factor) );
|
||||
Size scaledImageSize( cvRound( grayImage.cols/factor ), cvRound( grayImage.rows/factor ) );
|
||||
Size processingRectSize( scaledImageSize.width - originalWindowSize.width + 1, scaledImageSize.height - originalWindowSize.height + 1 );
|
||||
Size processingRectSize( scaledImageSize.width - originalWindowSize.width, scaledImageSize.height - originalWindowSize.height );
|
||||
|
||||
if( processingRectSize.width <= 0 || processingRectSize.height <= 0 )
|
||||
break;
|
||||
|
@ -151,7 +151,7 @@ namespace cv
|
||||
static Context *getContext();
|
||||
static void setContext(Info &oclinfo);
|
||||
|
||||
enum {CL_DOUBLE, CL_UNIFIED_MEM};
|
||||
enum {CL_DOUBLE, CL_UNIFIED_MEM, CL_VER_1_2};
|
||||
bool supportsFeature(int ftype);
|
||||
size_t computeUnits();
|
||||
size_t maxWorkGroupSize();
|
||||
@ -264,9 +264,9 @@ namespace cv
|
||||
void create(Size size, int type);
|
||||
|
||||
//! allocates new oclMatrix with specified device memory type.
|
||||
void createEx(int rows, int cols, int type,
|
||||
void createEx(int rows, int cols, int type,
|
||||
DevMemRW rw_type, DevMemType mem_type, void* hptr = 0);
|
||||
void createEx(Size size, int type, DevMemRW rw_type,
|
||||
void createEx(Size size, int type, DevMemRW rw_type,
|
||||
DevMemType mem_type, void* hptr = 0);
|
||||
|
||||
//! decreases reference counter;
|
||||
@ -406,6 +406,9 @@ namespace cv
|
||||
//! computes element-wise product of the two arrays (c = a * b)
|
||||
// supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
|
||||
CV_EXPORTS void multiply(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
|
||||
//! multiplies matrix to a number (dst = scalar * src)
|
||||
// supports CV_32FC1 only
|
||||
CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst);
|
||||
//! computes element-wise quotient of the two arrays (c = a / b)
|
||||
// supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
|
||||
CV_EXPORTS void divide(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
|
||||
@ -823,7 +826,44 @@ namespace cv
|
||||
};
|
||||
#endif
|
||||
|
||||
class CV_EXPORTS OclCascadeClassifierBuf : public cv::CascadeClassifier
|
||||
{
|
||||
public:
|
||||
OclCascadeClassifierBuf() :
|
||||
m_flags(0), initialized(false), m_scaleFactor(0), buffers(NULL) {}
|
||||
|
||||
~OclCascadeClassifierBuf() {}
|
||||
|
||||
void detectMultiScale(oclMat &image, CV_OUT std::vector<cv::Rect>& faces,
|
||||
double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0,
|
||||
Size minSize = Size(), Size maxSize = Size());
|
||||
void release();
|
||||
|
||||
private:
|
||||
void Init(const int rows, const int cols, double scaleFactor, int flags,
|
||||
const int outputsz, const size_t localThreads[],
|
||||
Size minSize, Size maxSize);
|
||||
void CreateBaseBufs(const int datasize, const int totalclassifier, const int flags, const int outputsz);
|
||||
void CreateFactorRelatedBufs(const int rows, const int cols, const int flags,
|
||||
const double scaleFactor, const size_t localThreads[],
|
||||
Size minSize, Size maxSize);
|
||||
void GenResult(CV_OUT std::vector<cv::Rect>& faces, const std::vector<cv::Rect> &rectList, const std::vector<int> &rweights);
|
||||
|
||||
int m_rows;
|
||||
int m_cols;
|
||||
int m_flags;
|
||||
int m_loopcount;
|
||||
int m_nodenum;
|
||||
bool findBiggestObject;
|
||||
bool initialized;
|
||||
double m_scaleFactor;
|
||||
Size m_minSize;
|
||||
Size m_maxSize;
|
||||
std::vector<Size> sizev;
|
||||
std::vector<float> scalev;
|
||||
oclMat gimg1, gsum, gsqsum;
|
||||
void * buffers;
|
||||
};
|
||||
|
||||
/////////////////////////////// Pyramid /////////////////////////////////////
|
||||
CV_EXPORTS void pyrDown(const oclMat &src, oclMat &dst);
|
||||
@ -849,7 +889,6 @@ namespace cv
|
||||
std::vector<oclMat> image_sqsums;
|
||||
};
|
||||
|
||||
|
||||
//! computes the proximity map for the raster template and the image where the template is searched for
|
||||
// Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
|
||||
// Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
|
||||
@ -1093,13 +1132,11 @@ namespace cv
|
||||
/****************************************************************************************\
|
||||
* Distance *
|
||||
\****************************************************************************************/
|
||||
|
||||
template<typename T>
|
||||
struct CV_EXPORTS Accumulator
|
||||
{
|
||||
typedef T Type;
|
||||
};
|
||||
|
||||
template<> struct Accumulator<unsigned char>
|
||||
{
|
||||
typedef float Type;
|
||||
@ -1173,469 +1210,244 @@ namespace cv
|
||||
{
|
||||
public:
|
||||
enum DistType {L1Dist = 0, L2Dist, HammingDist};
|
||||
|
||||
explicit BruteForceMatcher_OCL_base(DistType distType = L2Dist);
|
||||
|
||||
|
||||
|
||||
// Add descriptors to train descriptor collection
|
||||
|
||||
void add(const std::vector<oclMat> &descCollection);
|
||||
|
||||
|
||||
|
||||
// Get train descriptors collection
|
||||
|
||||
const std::vector<oclMat> &getTrainDescriptors() const;
|
||||
|
||||
|
||||
|
||||
// Clear train descriptors collection
|
||||
|
||||
void clear();
|
||||
|
||||
|
||||
|
||||
// Return true if there are not train descriptors in collection
|
||||
|
||||
bool empty() const;
|
||||
|
||||
|
||||
|
||||
// Return true if the matcher supports mask in match methods
|
||||
|
||||
bool isMaskSupported() const;
|
||||
|
||||
|
||||
|
||||
// Find one best match for each query descriptor
|
||||
|
||||
void matchSingle(const oclMat &query, const oclMat &train,
|
||||
|
||||
oclMat &trainIdx, oclMat &distance,
|
||||
|
||||
const oclMat &mask = oclMat());
|
||||
|
||||
|
||||
|
||||
// Download trainIdx and distance and convert it to CPU vector with DMatch
|
||||
|
||||
static void matchDownload(const oclMat &trainIdx, const oclMat &distance, std::vector<DMatch> &matches);
|
||||
|
||||
// Convert trainIdx and distance to vector with DMatch
|
||||
|
||||
static void matchConvert(const Mat &trainIdx, const Mat &distance, std::vector<DMatch> &matches);
|
||||
|
||||
|
||||
|
||||
// Find one best match for each query descriptor
|
||||
|
||||
void match(const oclMat &query, const oclMat &train, std::vector<DMatch> &matches, const oclMat &mask = oclMat());
|
||||
|
||||
|
||||
|
||||
// Make gpu collection of trains and masks in suitable format for matchCollection function
|
||||
|
||||
void makeGpuCollection(oclMat &trainCollection, oclMat &maskCollection, const std::vector<oclMat> &masks = std::vector<oclMat>());
|
||||
|
||||
|
||||
|
||||
// Find one best match from train collection for each query descriptor
|
||||
|
||||
void matchCollection(const oclMat &query, const oclMat &trainCollection,
|
||||
|
||||
oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
|
||||
|
||||
const oclMat &masks = oclMat());
|
||||
|
||||
|
||||
|
||||
// Download trainIdx, imgIdx and distance and convert it to vector with DMatch
|
||||
|
||||
static void matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, std::vector<DMatch> &matches);
|
||||
|
||||
// Convert trainIdx, imgIdx and distance to vector with DMatch
|
||||
|
||||
static void matchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, std::vector<DMatch> &matches);
|
||||
|
||||
|
||||
|
||||
// Find one best match from train collection for each query descriptor.
|
||||
|
||||
void match(const oclMat &query, std::vector<DMatch> &matches, const std::vector<oclMat> &masks = std::vector<oclMat>());
|
||||
|
||||
|
||||
|
||||
// Find k best matches for each query descriptor (in increasing order of distances)
|
||||
|
||||
void knnMatchSingle(const oclMat &query, const oclMat &train,
|
||||
|
||||
oclMat &trainIdx, oclMat &distance, oclMat &allDist, int k,
|
||||
|
||||
const oclMat &mask = oclMat());
|
||||
|
||||
|
||||
|
||||
// Download trainIdx and distance and convert it to vector with DMatch
|
||||
|
||||
// compactResult is used when mask is not empty. If compactResult is false matches
|
||||
|
||||
// vector will have the same size as queryDescriptors rows. If compactResult is true
|
||||
|
||||
// matches vector will not contain matches for fully masked out query descriptors.
|
||||
|
||||
static void knnMatchDownload(const oclMat &trainIdx, const oclMat &distance,
|
||||
|
||||
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
|
||||
|
||||
// Convert trainIdx and distance to vector with DMatch
|
||||
|
||||
static void knnMatchConvert(const Mat &trainIdx, const Mat &distance,
|
||||
|
||||
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
|
||||
|
||||
|
||||
|
||||
// Find k best matches for each query descriptor (in increasing order of distances).
|
||||
|
||||
// compactResult is used when mask is not empty. If compactResult is false matches
|
||||
|
||||
// vector will have the same size as queryDescriptors rows. If compactResult is true
|
||||
|
||||
// matches vector will not contain matches for fully masked out query descriptors.
|
||||
|
||||
void knnMatch(const oclMat &query, const oclMat &train,
|
||||
|
||||
std::vector< std::vector<DMatch> > &matches, int k, const oclMat &mask = oclMat(),
|
||||
|
||||
bool compactResult = false);
|
||||
|
||||
|
||||
|
||||
// Find k best matches from train collection for each query descriptor (in increasing order of distances)
|
||||
|
||||
void knnMatch2Collection(const oclMat &query, const oclMat &trainCollection,
|
||||
|
||||
oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
|
||||
|
||||
const oclMat &maskCollection = oclMat());
|
||||
|
||||
|
||||
|
||||
// Download trainIdx and distance and convert it to vector with DMatch
|
||||
|
||||
// compactResult is used when mask is not empty. If compactResult is false matches
|
||||
|
||||
// vector will have the same size as queryDescriptors rows. If compactResult is true
|
||||
|
||||
// matches vector will not contain matches for fully masked out query descriptors.
|
||||
|
||||
static void knnMatch2Download(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance,
|
||||
|
||||
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
|
||||
|
||||
// Convert trainIdx and distance to vector with DMatch
|
||||
|
||||
static void knnMatch2Convert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance,
|
||||
|
||||
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
|
||||
|
||||
|
||||
|
||||
// Find k best matches for each query descriptor (in increasing order of distances).
|
||||
|
||||
// compactResult is used when mask is not empty. If compactResult is false matches
|
||||
|
||||
// vector will have the same size as queryDescriptors rows. If compactResult is true
|
||||
|
||||
// matches vector will not contain matches for fully masked out query descriptors.
|
||||
|
||||
void knnMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, int k,
|
||||
|
||||
const std::vector<oclMat> &masks = std::vector<oclMat>(), bool compactResult = false);
|
||||
|
||||
|
||||
|
||||
// Find best matches for each query descriptor which have distance less than maxDistance.
|
||||
|
||||
// nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.
|
||||
|
||||
// carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches,
|
||||
|
||||
// because it didn't have enough memory.
|
||||
|
||||
// If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
|
||||
|
||||
// otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
|
||||
|
||||
// Matches doesn't sorted.
|
||||
|
||||
void radiusMatchSingle(const oclMat &query, const oclMat &train,
|
||||
|
||||
oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance,
|
||||
|
||||
const oclMat &mask = oclMat());
|
||||
|
||||
|
||||
|
||||
// Download trainIdx, nMatches and distance and convert it to vector with DMatch.
|
||||
|
||||
// matches will be sorted in increasing order of distances.
|
||||
|
||||
// compactResult is used when mask is not empty. If compactResult is false matches
|
||||
|
||||
// vector will have the same size as queryDescriptors rows. If compactResult is true
|
||||
|
||||
// matches vector will not contain matches for fully masked out query descriptors.
|
||||
|
||||
static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches,
|
||||
|
||||
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
|
||||
|
||||
// Convert trainIdx, nMatches and distance to vector with DMatch.
|
||||
|
||||
static void radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &nMatches,
|
||||
|
||||
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
|
||||
|
||||
|
||||
|
||||
// Find best matches for each query descriptor which have distance less than maxDistance
|
||||
|
||||
// in increasing order of distances).
|
||||
|
||||
void radiusMatch(const oclMat &query, const oclMat &train,
|
||||
|
||||
std::vector< std::vector<DMatch> > &matches, float maxDistance,
|
||||
|
||||
const oclMat &mask = oclMat(), bool compactResult = false);
|
||||
|
||||
|
||||
|
||||
// Find best matches for each query descriptor which have distance less than maxDistance.
|
||||
|
||||
// If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
|
||||
|
||||
// otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
|
||||
|
||||
// Matches doesn't sorted.
|
||||
|
||||
void radiusMatchCollection(const oclMat &query, oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, oclMat &nMatches, float maxDistance,
|
||||
|
||||
const std::vector<oclMat> &masks = std::vector<oclMat>());
|
||||
|
||||
|
||||
|
||||
// Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.
|
||||
|
||||
// matches will be sorted in increasing order of distances.
|
||||
|
||||
// compactResult is used when mask is not empty. If compactResult is false matches
|
||||
|
||||
// vector will have the same size as queryDescriptors rows. If compactResult is true
|
||||
|
||||
// matches vector will not contain matches for fully masked out query descriptors.
|
||||
|
||||
static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, const oclMat &nMatches,
|
||||
|
||||
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
|
||||
|
||||
// Convert trainIdx, nMatches and distance to vector with DMatch.
|
||||
|
||||
static void radiusMatchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, const Mat &nMatches,
|
||||
|
||||
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
|
||||
|
||||
|
||||
|
||||
// Find best matches from train collection for each query descriptor which have distance less than
|
||||
|
||||
// maxDistance (in increasing order of distances).
|
||||
|
||||
void radiusMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, float maxDistance,
|
||||
|
||||
const std::vector<oclMat> &masks = std::vector<oclMat>(), bool compactResult = false);
|
||||
|
||||
|
||||
|
||||
DistType distType;
|
||||
|
||||
|
||||
|
||||
private:
|
||||
|
||||
std::vector<oclMat> trainDescCollection;
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
template <class Distance>
|
||||
|
||||
class CV_EXPORTS BruteForceMatcher_OCL;
|
||||
|
||||
|
||||
|
||||
template <typename T>
|
||||
|
||||
class CV_EXPORTS BruteForceMatcher_OCL< L1<T> > : public BruteForceMatcher_OCL_base
|
||||
|
||||
{
|
||||
|
||||
public:
|
||||
|
||||
explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L1Dist) {}
|
||||
|
||||
explicit BruteForceMatcher_OCL(L1<T> /*d*/) : BruteForceMatcher_OCL_base(L1Dist) {}
|
||||
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
|
||||
class CV_EXPORTS BruteForceMatcher_OCL< L2<T> > : public BruteForceMatcher_OCL_base
|
||||
|
||||
{
|
||||
|
||||
public:
|
||||
|
||||
explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L2Dist) {}
|
||||
|
||||
explicit BruteForceMatcher_OCL(L2<T> /*d*/) : BruteForceMatcher_OCL_base(L2Dist) {}
|
||||
|
||||
};
|
||||
|
||||
template <> class CV_EXPORTS BruteForceMatcher_OCL< Hamming > : public BruteForceMatcher_OCL_base
|
||||
|
||||
{
|
||||
|
||||
public:
|
||||
|
||||
explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(HammingDist) {}
|
||||
|
||||
explicit BruteForceMatcher_OCL(Hamming /*d*/) : BruteForceMatcher_OCL_base(HammingDist) {}
|
||||
|
||||
};
|
||||
|
||||
|
||||
class CV_EXPORTS BFMatcher_OCL : public BruteForceMatcher_OCL_base
|
||||
{
|
||||
public:
|
||||
explicit BFMatcher_OCL(int norm = NORM_L2) : BruteForceMatcher_OCL_base(norm == NORM_L1 ? L1Dist : norm == NORM_L2 ? L2Dist : HammingDist) {}
|
||||
};
|
||||
|
||||
/////////////////////////////// PyrLKOpticalFlow /////////////////////////////////////
|
||||
|
||||
class CV_EXPORTS PyrLKOpticalFlow
|
||||
|
||||
{
|
||||
|
||||
public:
|
||||
|
||||
PyrLKOpticalFlow()
|
||||
|
||||
{
|
||||
|
||||
winSize = Size(21, 21);
|
||||
|
||||
maxLevel = 3;
|
||||
|
||||
iters = 30;
|
||||
|
||||
derivLambda = 0.5;
|
||||
|
||||
useInitialFlow = false;
|
||||
|
||||
minEigThreshold = 1e-4f;
|
||||
|
||||
getMinEigenVals = false;
|
||||
|
||||
isDeviceArch11_ = false;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
void sparse(const oclMat &prevImg, const oclMat &nextImg, const oclMat &prevPts, oclMat &nextPts,
|
||||
|
||||
oclMat &status, oclMat *err = 0);
|
||||
|
||||
|
||||
|
||||
void dense(const oclMat &prevImg, const oclMat &nextImg, oclMat &u, oclMat &v, oclMat *err = 0);
|
||||
|
||||
|
||||
|
||||
Size winSize;
|
||||
|
||||
int maxLevel;
|
||||
|
||||
int iters;
|
||||
|
||||
double derivLambda;
|
||||
|
||||
bool useInitialFlow;
|
||||
|
||||
float minEigThreshold;
|
||||
|
||||
bool getMinEigenVals;
|
||||
|
||||
|
||||
|
||||
void releaseMemory()
|
||||
|
||||
{
|
||||
|
||||
dx_calcBuf_.release();
|
||||
|
||||
dy_calcBuf_.release();
|
||||
|
||||
|
||||
|
||||
prevPyr_.clear();
|
||||
|
||||
nextPyr_.clear();
|
||||
|
||||
|
||||
|
||||
dx_buf_.release();
|
||||
|
||||
dy_buf_.release();
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
private:
|
||||
|
||||
void calcSharrDeriv(const oclMat &src, oclMat &dx, oclMat &dy);
|
||||
|
||||
|
||||
|
||||
void buildImagePyramid(const oclMat &img0, std::vector<oclMat> &pyr, bool withBorder);
|
||||
|
||||
|
||||
|
||||
oclMat dx_calcBuf_;
|
||||
|
||||
oclMat dy_calcBuf_;
|
||||
|
||||
|
||||
|
||||
std::vector<oclMat> prevPyr_;
|
||||
|
||||
std::vector<oclMat> nextPyr_;
|
||||
|
||||
|
||||
|
||||
oclMat dx_buf_;
|
||||
|
||||
oclMat dy_buf_;
|
||||
|
||||
|
||||
|
||||
oclMat uPyr_[2];
|
||||
|
||||
oclMat vPyr_[2];
|
||||
|
||||
|
||||
|
||||
bool isDeviceArch11_;
|
||||
|
||||
};
|
||||
//////////////// build warping maps ////////////////////
|
||||
//! builds plane warping maps
|
||||
@ -1706,6 +1518,7 @@ namespace cv
|
||||
private:
|
||||
oclMat minSSD, leBuf, riBuf;
|
||||
};
|
||||
|
||||
class CV_EXPORTS StereoBeliefPropagation
|
||||
{
|
||||
public:
|
||||
@ -1736,6 +1549,133 @@ namespace cv
|
||||
std::vector<oclMat> datas;
|
||||
oclMat out;
|
||||
};
|
||||
|
||||
class CV_EXPORTS StereoConstantSpaceBP
|
||||
{
|
||||
public:
|
||||
enum { DEFAULT_NDISP = 128 };
|
||||
enum { DEFAULT_ITERS = 8 };
|
||||
enum { DEFAULT_LEVELS = 4 };
|
||||
enum { DEFAULT_NR_PLANE = 4 };
|
||||
static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane);
|
||||
explicit StereoConstantSpaceBP(
|
||||
int ndisp = DEFAULT_NDISP,
|
||||
int iters = DEFAULT_ITERS,
|
||||
int levels = DEFAULT_LEVELS,
|
||||
int nr_plane = DEFAULT_NR_PLANE,
|
||||
int msg_type = CV_32F);
|
||||
StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
|
||||
float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
|
||||
int min_disp_th = 0,
|
||||
int msg_type = CV_32F);
|
||||
void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
|
||||
int ndisp;
|
||||
int iters;
|
||||
int levels;
|
||||
int nr_plane;
|
||||
float max_data_term;
|
||||
float data_weight;
|
||||
float max_disc_term;
|
||||
float disc_single_jump;
|
||||
int min_disp_th;
|
||||
int msg_type;
|
||||
bool use_local_init_data_cost;
|
||||
private:
|
||||
oclMat u[2], d[2], l[2], r[2];
|
||||
oclMat disp_selected_pyr[2];
|
||||
oclMat data_cost;
|
||||
oclMat data_cost_selected;
|
||||
oclMat temp;
|
||||
oclMat out;
|
||||
};
|
||||
|
||||
// Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
|
||||
//
|
||||
// see reference:
|
||||
// [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
|
||||
// [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
|
||||
class CV_EXPORTS OpticalFlowDual_TVL1_OCL
|
||||
{
|
||||
public:
|
||||
OpticalFlowDual_TVL1_OCL();
|
||||
|
||||
void operator ()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy);
|
||||
|
||||
void collectGarbage();
|
||||
|
||||
/**
|
||||
* Time step of the numerical scheme.
|
||||
*/
|
||||
double tau;
|
||||
|
||||
/**
|
||||
* Weight parameter for the data term, attachment parameter.
|
||||
* This is the most relevant parameter, which determines the smoothness of the output.
|
||||
* The smaller this parameter is, the smoother the solutions we obtain.
|
||||
* It depends on the range of motions of the images, so its value should be adapted to each image sequence.
|
||||
*/
|
||||
double lambda;
|
||||
|
||||
/**
|
||||
* Weight parameter for (u - v)^2, tightness parameter.
|
||||
* It serves as a link between the attachment and the regularization terms.
|
||||
* In theory, it should have a small value in order to maintain both parts in correspondence.
|
||||
* The method is stable for a large range of values of this parameter.
|
||||
*/
|
||||
double theta;
|
||||
|
||||
/**
|
||||
* Number of scales used to create the pyramid of images.
|
||||
*/
|
||||
int nscales;
|
||||
|
||||
/**
|
||||
* Number of warpings per scale.
|
||||
* Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale.
|
||||
* This is a parameter that assures the stability of the method.
|
||||
* It also affects the running time, so it is a compromise between speed and accuracy.
|
||||
*/
|
||||
int warps;
|
||||
|
||||
/**
|
||||
* Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
|
||||
* A small value will yield more accurate solutions at the expense of a slower convergence.
|
||||
*/
|
||||
double epsilon;
|
||||
|
||||
/**
|
||||
* Stopping criterion iterations number used in the numerical scheme.
|
||||
*/
|
||||
int iterations;
|
||||
|
||||
bool useInitialFlow;
|
||||
|
||||
private:
|
||||
void procOneScale(const oclMat& I0, const oclMat& I1, oclMat& u1, oclMat& u2);
|
||||
|
||||
std::vector<oclMat> I0s;
|
||||
std::vector<oclMat> I1s;
|
||||
std::vector<oclMat> u1s;
|
||||
std::vector<oclMat> u2s;
|
||||
|
||||
oclMat I1x_buf;
|
||||
oclMat I1y_buf;
|
||||
|
||||
oclMat I1w_buf;
|
||||
oclMat I1wx_buf;
|
||||
oclMat I1wy_buf;
|
||||
|
||||
oclMat grad_buf;
|
||||
oclMat rho_c_buf;
|
||||
|
||||
oclMat p11_buf;
|
||||
oclMat p12_buf;
|
||||
oclMat p21_buf;
|
||||
oclMat p22_buf;
|
||||
|
||||
oclMat diff_buf;
|
||||
oclMat norm_buf;
|
||||
};
|
||||
}
|
||||
}
|
||||
#if defined _MSC_VER && _MSC_VER >= 1200
|
||||
|
@ -45,4 +45,4 @@
|
||||
#error this is a compatibility header which should not be used inside the OpenCV library
|
||||
#endif
|
||||
|
||||
#include "opencv2/ocl.hpp"
|
||||
#include "opencv2/ocl.hpp"
|
||||
|
@ -22,6 +22,7 @@
|
||||
// Jiang Liyuan, jlyuan001.good@163.com
|
||||
// Rock Li, Rock.Li@amd.com
|
||||
// Zailong Wu, bullet@yeah.net
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -61,8 +62,6 @@ namespace cv
|
||||
namespace ocl
|
||||
{
|
||||
////////////////////////////////OpenCL kernel strings/////////////////////
|
||||
extern const char *bitwise;
|
||||
extern const char *bitwiseM;
|
||||
extern const char *transpose_kernel;
|
||||
extern const char *arithm_nonzero;
|
||||
extern const char *arithm_sum;
|
||||
@ -76,19 +75,11 @@ namespace cv
|
||||
extern const char *arithm_add;
|
||||
extern const char *arithm_add_scalar;
|
||||
extern const char *arithm_add_scalar_mask;
|
||||
extern const char *arithm_bitwise_binary;
|
||||
extern const char *arithm_bitwise_binary_mask;
|
||||
extern const char *arithm_bitwise_binary_scalar;
|
||||
extern const char *arithm_bitwise_binary_scalar_mask;
|
||||
extern const char *arithm_bitwise_not;
|
||||
extern const char *arithm_bitwise_and;
|
||||
extern const char *arithm_bitwise_and_mask;
|
||||
extern const char *arithm_bitwise_and_scalar;
|
||||
extern const char *arithm_bitwise_and_scalar_mask;
|
||||
extern const char *arithm_bitwise_or;
|
||||
extern const char *arithm_bitwise_or_mask;
|
||||
extern const char *arithm_bitwise_or_scalar;
|
||||
extern const char *arithm_bitwise_or_scalar_mask;
|
||||
extern const char *arithm_bitwise_xor;
|
||||
extern const char *arithm_bitwise_xor_mask;
|
||||
extern const char *arithm_bitwise_xor_scalar;
|
||||
extern const char *arithm_bitwise_xor_scalar_mask;
|
||||
extern const char *arithm_compare_eq;
|
||||
extern const char *arithm_compare_ne;
|
||||
extern const char *arithm_mul;
|
||||
@ -126,7 +117,7 @@ inline int divUp(int total, int grain)
|
||||
/////////////////////// add subtract multiply divide /////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
template<typename T>
|
||||
void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
|
||||
void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
|
||||
String kernelName, const char **kernelString, void *_scalar, int op_type = 0)
|
||||
{
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
@ -195,12 +186,12 @@ void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
|
||||
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
|
||||
}
|
||||
}
|
||||
static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
|
||||
static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
|
||||
String kernelName, const char **kernelString, int op_type = 0)
|
||||
{
|
||||
arithmetic_run<char>(src1, src2, dst, kernelName, kernelString, (void *)NULL, op_type);
|
||||
}
|
||||
static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask,
|
||||
static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask,
|
||||
String kernelName, const char **kernelString, int op_type = 0)
|
||||
{
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
@ -295,6 +286,7 @@ void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, doub
|
||||
else
|
||||
arithmetic_run<float>(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar));
|
||||
}
|
||||
|
||||
void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
|
||||
{
|
||||
|
||||
@ -479,6 +471,11 @@ void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, cons
|
||||
|
||||
arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, -1);
|
||||
}
|
||||
void cv::ocl::multiply(double scalar, const oclMat &src, oclMat &dst)
|
||||
{
|
||||
String kernelName = "arithm_muls";
|
||||
arithmetic_scalar_run( src, dst, kernelName, &arithm_mul, scalar);
|
||||
}
|
||||
void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst)
|
||||
{
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
|
||||
@ -1647,7 +1644,8 @@ static void bitwise_run(const oclMat &src1, oclMat &dst, String kernelName, cons
|
||||
|
||||
|
||||
template<typename T>
|
||||
void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, const char **kernelString, void *_scalar)
|
||||
void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName,
|
||||
const char **kernelString, void *_scalar, const char* _opt = NULL)
|
||||
{
|
||||
dst.create(src1.size(), src1.type());
|
||||
CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
|
||||
@ -1697,13 +1695,15 @@ void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String ker
|
||||
args.push_back( std::make_pair( sizeof(T), (void *)&scalar ));
|
||||
}
|
||||
|
||||
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
|
||||
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, _opt);
|
||||
}
|
||||
static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, const char **kernelString)
|
||||
static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
|
||||
String kernelName, const char **kernelString, const char* _opt = NULL)
|
||||
{
|
||||
bitwise_run<char>(src1, src2, dst, kernelName, kernelString, (void *)NULL);
|
||||
bitwise_run<char>(src1, src2, dst, kernelName, kernelString, (void *)NULL, _opt);
|
||||
}
|
||||
static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString)
|
||||
static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
|
||||
const oclMat &mask, String kernelName, const char **kernelString, const char* _opt = NULL)
|
||||
{
|
||||
dst.create(src1.size(), src1.type());
|
||||
CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
|
||||
@ -1751,12 +1751,13 @@ static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, con
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
|
||||
|
||||
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth);
|
||||
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, _opt);
|
||||
}
|
||||
|
||||
|
||||
template <typename WT , typename CL_WT>
|
||||
void bitwise_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar)
|
||||
void bitwise_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst,
|
||||
const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar, const char* opt = NULL)
|
||||
{
|
||||
dst.create(src1.size(), src1.type());
|
||||
|
||||
@ -1818,14 +1819,16 @@ void bitwise_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, con
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&isMatSubScalar));
|
||||
}
|
||||
|
||||
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth);
|
||||
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, opt);
|
||||
}
|
||||
|
||||
|
||||
typedef void (*BitwiseFuncS)(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar);
|
||||
typedef void (*BitwiseFuncS)(const oclMat &src1, const Scalar &src2, oclMat &dst,
|
||||
const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar, const char* opt);
|
||||
|
||||
|
||||
static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar)
|
||||
static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst,
|
||||
const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar, const char* opt)
|
||||
{
|
||||
static BitwiseFuncS tab[8] =
|
||||
{
|
||||
@ -1853,11 +1856,12 @@ static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst,
|
||||
BitwiseFuncS func = tab[src1.depth()];
|
||||
if(func == 0)
|
||||
cv::error(Error::StsBadArg, "Unsupported arithmetic operation", "", __FILE__, __LINE__);
|
||||
func(src1, src2, dst, mask, kernelName, kernelString, isMatSubScalar);
|
||||
func(src1, src2, dst, mask, kernelName, kernelString, isMatSubScalar, opt);
|
||||
}
|
||||
static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString)
|
||||
static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst,
|
||||
const oclMat &mask, String kernelName, const char **kernelString, const char * opt = NULL)
|
||||
{
|
||||
bitwise_scalar(src1, src2, dst, mask, kernelName, kernelString, 0);
|
||||
bitwise_scalar(src1, src2, dst, mask, kernelName, kernelString, 0, opt);
|
||||
}
|
||||
|
||||
void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst)
|
||||
@ -1880,12 +1884,13 @@ void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, co
|
||||
std::cout << "Selected device do not support double" << std::endl;
|
||||
return;
|
||||
}
|
||||
oclMat emptyMat;
|
||||
String kernelName = mask.empty() ? "arithm_bitwise_or" : "arithm_bitwise_or_with_mask";
|
||||
|
||||
String kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask";
|
||||
static const char opt [] = "-D OP_BINARY=|";
|
||||
if (mask.empty())
|
||||
bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_or);
|
||||
bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt);
|
||||
else
|
||||
bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_or_mask);
|
||||
bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt);
|
||||
}
|
||||
|
||||
|
||||
@ -1896,11 +1901,12 @@ void cv::ocl::bitwise_or(const oclMat &src1, const Scalar &src2, oclMat &dst, co
|
||||
std::cout << "Selected device do not support double" << std::endl;
|
||||
return;
|
||||
}
|
||||
String kernelName = mask.data ? "arithm_s_bitwise_or_with_mask" : "arithm_s_bitwise_or";
|
||||
static const char opt [] = "-D OP_BINARY=|";
|
||||
String kernelName = mask.data ? "arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary";
|
||||
if (mask.data)
|
||||
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_or_scalar_mask);
|
||||
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt);
|
||||
else
|
||||
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_or_scalar);
|
||||
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt);
|
||||
}
|
||||
|
||||
void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
|
||||
@ -1913,12 +1919,13 @@ void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, c
|
||||
}
|
||||
oclMat emptyMat;
|
||||
|
||||
String kernelName = mask.empty() ? "arithm_bitwise_and" : "arithm_bitwise_and_with_mask";
|
||||
String kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask";
|
||||
|
||||
static const char opt [] = "-D OP_BINARY=&";
|
||||
if (mask.empty())
|
||||
bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_and);
|
||||
bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt);
|
||||
else
|
||||
bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_and_mask);
|
||||
bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt);
|
||||
}
|
||||
|
||||
void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
|
||||
@ -1928,11 +1935,12 @@ void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, c
|
||||
std::cout << "Selected device do not support double" << std::endl;
|
||||
return;
|
||||
}
|
||||
String kernelName = mask.data ? "arithm_s_bitwise_and_with_mask" : "arithm_s_bitwise_and";
|
||||
static const char opt [] = "-D OP_BINARY=&";
|
||||
String kernelName = mask.data ? "arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary";
|
||||
if (mask.data)
|
||||
bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_and_scalar_mask);
|
||||
bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt);
|
||||
else
|
||||
bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_and_scalar);
|
||||
bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt);
|
||||
}
|
||||
|
||||
void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
|
||||
@ -1942,14 +1950,14 @@ void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, c
|
||||
std::cout << "Selected device do not support double" << std::endl;
|
||||
return;
|
||||
}
|
||||
oclMat emptyMat;
|
||||
String kernelName = mask.empty() ? "arithm_bitwise_xor" : "arithm_bitwise_xor_with_mask";
|
||||
String kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask";
|
||||
|
||||
static const char opt [] = "-D OP_BINARY=^";
|
||||
|
||||
if (mask.empty())
|
||||
bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_xor);
|
||||
bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt);
|
||||
else
|
||||
bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_xor_mask);
|
||||
bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt);
|
||||
}
|
||||
|
||||
|
||||
@ -1961,11 +1969,12 @@ void cv::ocl::bitwise_xor(const oclMat &src1, const Scalar &src2, oclMat &dst, c
|
||||
std::cout << "Selected device do not support double" << std::endl;
|
||||
return;
|
||||
}
|
||||
String kernelName = mask.data ? "arithm_s_bitwise_xor_with_mask" : "arithm_s_bitwise_xor";
|
||||
String kernelName = mask.data ? "arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary";
|
||||
static const char opt [] = "-D OP_BINARY=^";
|
||||
if (mask.data)
|
||||
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_xor_scalar_mask);
|
||||
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt);
|
||||
else
|
||||
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_xor_scalar);
|
||||
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt);
|
||||
}
|
||||
|
||||
oclMat cv::ocl::operator ~ (const oclMat &src)
|
||||
|
@ -844,8 +844,8 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch2Collection(const oclMat &quer
|
||||
if (query.empty() || trainCollection.empty())
|
||||
return;
|
||||
|
||||
typedef void (*caller_t)(const oclMat & query, const oclMat & trains, const oclMat & masks,
|
||||
const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance);
|
||||
// typedef void (*caller_t)(const oclMat & query, const oclMat & trains, const oclMat & masks,
|
||||
// const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance);
|
||||
|
||||
CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
|
||||
|
||||
@ -992,7 +992,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, std::vec
|
||||
|
||||
// radiusMatchSingle
|
||||
void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, const oclMat &train,
|
||||
oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask)
|
||||
oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask)
|
||||
{
|
||||
if (query.empty() || train.empty())
|
||||
return;
|
||||
@ -1094,9 +1094,9 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchCollection(const oclMat &qu
|
||||
if (query.empty() || empty())
|
||||
return;
|
||||
|
||||
#if 0
|
||||
typedef void (*caller_t)(const oclMat & query, const oclMat * trains, int n, float maxDistance, const oclMat * masks,
|
||||
const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance, const oclMat & nMatches);
|
||||
#if 0
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
{
|
||||
|
@ -60,7 +60,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
|
||||
const oclMat &src3, double beta, oclMat &dst, int flags)
|
||||
{
|
||||
CV_Assert(src1.cols == src2.rows &&
|
||||
(src3.empty() || src1.rows == src3.rows && src2.cols == src3.cols));
|
||||
(src3.empty() || (src1.rows == src3.rows && src2.cols == src3.cols)));
|
||||
CV_Assert(!(cv::GEMM_3_T & flags)); // cv::GEMM_3_T is not supported
|
||||
if(!src3.empty())
|
||||
{
|
||||
|
@ -20,6 +20,7 @@
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Wu Xinglong, wxl370@126.com
|
||||
// Wang Yao, bitwangyaoyao@gmail.com
|
||||
// Sen Liu, swjtuls1987@126.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -144,7 +145,8 @@ typedef struct
|
||||
int imgoff;
|
||||
float factor;
|
||||
} detect_piramid_info;
|
||||
#ifdef WIN32
|
||||
|
||||
#if defined WIN32 && !defined __MINGW__ && !defined __MINGW32__
|
||||
#define _ALIGNED_ON(_ALIGNMENT) __declspec(align(_ALIGNMENT))
|
||||
typedef _ALIGNED_ON(128) struct GpuHidHaarFeature
|
||||
{
|
||||
@ -841,15 +843,13 @@ static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
|
||||
} /* j */
|
||||
}
|
||||
}
|
||||
|
||||
CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemStorage *storage, double scaleFactor,
|
||||
int minNeighbors, int flags, CvSize minSize, CvSize maxSize)
|
||||
{
|
||||
CvHaarClassifierCascade *cascade = oldCascade;
|
||||
|
||||
//double alltime = (double)cvGetTickCount();
|
||||
//double t = (double)cvGetTickCount();
|
||||
const double GROUP_EPS = 0.2;
|
||||
oclMat gtemp, gsum1, gtilted1, gsqsum1, gnormImg, gsumcanny;
|
||||
CvSeq *result_seq = 0;
|
||||
cv::Ptr<CvMemStorage> temp_storage;
|
||||
|
||||
@ -860,7 +860,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
int datasize=0;
|
||||
int totalclassifier=0;
|
||||
|
||||
//void *out;
|
||||
GpuHidHaarClassifierCascade *gcascade;
|
||||
GpuHidHaarStageClassifier *stage;
|
||||
GpuHidHaarClassifier *classifier;
|
||||
@ -869,11 +868,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
int *candidate;
|
||||
cl_int status;
|
||||
|
||||
// bool doCannyPruning = (flags & CV_HAAR_DO_CANNY_PRUNING) != 0;
|
||||
bool findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
|
||||
// bool roughSearch = (flags & CV_HAAR_DO_ROUGH_SEARCH) != 0;
|
||||
|
||||
//double t = 0;
|
||||
if( maxSize.height == 0 || maxSize.width == 0 )
|
||||
{
|
||||
maxSize.height = gimg.rows;
|
||||
@ -895,27 +891,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
if( findBiggestObject )
|
||||
flags &= ~CV_HAAR_SCALE_IMAGE;
|
||||
|
||||
//gtemp = oclMat( gimg.rows, gimg.cols, CV_8UC1);
|
||||
//gsum1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32SC1 );
|
||||
//gsqsum1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32FC1 );
|
||||
|
||||
if( !cascade->hid_cascade )
|
||||
/*out = (void *)*/gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
|
||||
if( cascade->hid_cascade->has_tilted_features )
|
||||
gtilted1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32SC1 );
|
||||
gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
|
||||
|
||||
result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), storage );
|
||||
|
||||
if( CV_MAT_CN(gimg.type()) > 1 )
|
||||
{
|
||||
oclMat gtemp;
|
||||
cvtColor( gimg, gtemp, COLOR_BGR2GRAY );
|
||||
gimg = gtemp;
|
||||
}
|
||||
|
||||
if( findBiggestObject )
|
||||
flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
|
||||
//t = (double)cvGetTickCount() - t;
|
||||
//printf( "before if time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
|
||||
if( gimg.cols < minSize.width || gimg.rows < minSize.height )
|
||||
CV_Error(CV_StsError, "Image too small");
|
||||
@ -923,12 +912,9 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
if( (flags & CV_HAAR_SCALE_IMAGE) )
|
||||
{
|
||||
CvSize winSize0 = cascade->orig_window_size;
|
||||
//float scalefactor = 1.1f;
|
||||
//float factor = 1.f;
|
||||
int totalheight = 0;
|
||||
int indexy = 0;
|
||||
CvSize sz;
|
||||
//t = (double)cvGetTickCount();
|
||||
std::vector<CvSize> sizev;
|
||||
std::vector<float> scalev;
|
||||
for(factor = 1.f;; factor *= scaleFactor)
|
||||
@ -949,20 +935,15 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
sizev.push_back(sz);
|
||||
scalev.push_back(factor);
|
||||
}
|
||||
//int flag = 0;
|
||||
|
||||
oclMat gimg1(gimg.rows, gimg.cols, CV_8UC1);
|
||||
oclMat gsum(totalheight + 4, gimg.cols + 1, CV_32SC1);
|
||||
oclMat gsqsum(totalheight + 4, gimg.cols + 1, CV_32FC1);
|
||||
|
||||
//cl_mem cascadebuffer;
|
||||
cl_mem stagebuffer;
|
||||
//cl_mem classifierbuffer;
|
||||
cl_mem nodebuffer;
|
||||
cl_mem candidatebuffer;
|
||||
cl_mem scaleinfobuffer;
|
||||
//cl_kernel kernel;
|
||||
//kernel = openCLGetKernelFromSource(gimg.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade");
|
||||
cv::Rect roi, roi2;
|
||||
cv::Mat imgroi, imgroisq;
|
||||
cv::ocl::oclMat resizeroi, gimgroi, gimgroisq;
|
||||
@ -970,18 +951,13 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
|
||||
size_t blocksize = 8;
|
||||
size_t localThreads[3] = { blocksize, blocksize , 1 };
|
||||
size_t globalThreads[3] = { grp_per_CU *((gsum.clCxt)->computeUnits()) *localThreads[0],
|
||||
size_t globalThreads[3] = { grp_per_CU * gsum.clCxt->computeUnits() *localThreads[0],
|
||||
localThreads[1], 1
|
||||
};
|
||||
int outputsz = 256 * globalThreads[0] / localThreads[0];
|
||||
int loopcount = sizev.size();
|
||||
detect_piramid_info *scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
|
||||
|
||||
//t = (double)cvGetTickCount() - t;
|
||||
// printf( "pre time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
//int *it =scaleinfo;
|
||||
// t = (double)cvGetTickCount();
|
||||
|
||||
for( int i = 0; i < loopcount; i++ )
|
||||
{
|
||||
sz = sizev[i];
|
||||
@ -991,7 +967,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
resizeroi = gimg1(roi2);
|
||||
gimgroi = gsum(roi);
|
||||
gimgroisq = gsqsum(roi);
|
||||
//scaleinfo[i].rows = gimgroi.rows;
|
||||
int width = gimgroi.cols - 1 - cascade->orig_window_size.width;
|
||||
int height = gimgroi.rows - 1 - cascade->orig_window_size.height;
|
||||
scaleinfo[i].width_height = (width << 16) | height;
|
||||
@ -999,76 +974,40 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
|
||||
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
|
||||
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
|
||||
//outputsz +=width*height;
|
||||
|
||||
scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
|
||||
scaleinfo[i].imgoff = gimgroi.offset >> 2;
|
||||
scaleinfo[i].factor = factor;
|
||||
//printf("rows = %d,ystep = %d,width = %d,height = %d,grpnumperline = %d,totalgrp = %d,imgoff = %d,factor = %f\n",
|
||||
// scaleinfo[i].rows,scaleinfo[i].ystep,scaleinfo[i].width,scaleinfo[i].height,scaleinfo[i].grpnumperline,
|
||||
// scaleinfo[i].totalgrp,scaleinfo[i].imgoff,scaleinfo[i].factor);
|
||||
cv::ocl::resize(gimg, resizeroi, Size(sz.width - 1, sz.height - 1), 0, 0, INTER_LINEAR);
|
||||
//cv::imwrite("D:\\1.jpg",gimg1);
|
||||
cv::ocl::integral(resizeroi, gimgroi, gimgroisq);
|
||||
//cv::ocl::oclMat chk(sz.height,sz.width,CV_32SC1),chksq(sz.height,sz.width,CV_32FC1);
|
||||
//cv::ocl::integral(gimg1, chk, chksq);
|
||||
//double r = cv::norm(chk,gimgroi,NORM_INF);
|
||||
//if(r > std::numeric_limits<double>::epsilon())
|
||||
//{
|
||||
// printf("failed");
|
||||
//}
|
||||
indexy += sz.height;
|
||||
}
|
||||
//int ystep = factor > 2 ? 1 : 2;
|
||||
// t = (double)cvGetTickCount() - t;
|
||||
//printf( "resize integral time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
//t = (double)cvGetTickCount();
|
||||
|
||||
gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
|
||||
stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
|
||||
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
|
||||
node = (GpuHidHaarTreeNode *)(classifier->node);
|
||||
|
||||
//int m,n;
|
||||
//m = (gsum.cols - 1 - cascade->orig_window_size.width + ystep - 1)/ystep;
|
||||
//n = (gsum.rows - 1 - cascade->orig_window_size.height + ystep - 1)/ystep;
|
||||
//int counter = m*n;
|
||||
|
||||
int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) -
|
||||
sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode);
|
||||
//if(flag == 0){
|
||||
candidate = (int *)malloc(4 * sizeof(int) * outputsz);
|
||||
//memset((char*)candidate,0,4*sizeof(int)*outputsz);
|
||||
gpuSetImagesForHaarClassifierCascade( cascade,/* &sum1, &sqsum1, _tilted,*/ 1., gsum.step / 4 );
|
||||
|
||||
//cascadebuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifierCascade),NULL,&status);
|
||||
//openCLVerifyCall(status);
|
||||
//openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,cascadebuffer,1,0,sizeof(GpuHidHaarClassifierCascade),gcascade,0,NULL,NULL));
|
||||
candidate = (int *)malloc(4 * sizeof(int) * outputsz);
|
||||
|
||||
gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
|
||||
|
||||
stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
|
||||
|
||||
//classifierbuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifier)*totalclassifier,NULL,&status);
|
||||
//status = clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,classifierbuffer,1,0,sizeof(GpuHidHaarClassifier)*totalclassifier,classifier,0,NULL,NULL);
|
||||
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
|
||||
|
||||
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode));
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), nodebuffer, 1, 0,
|
||||
nodenum * sizeof(GpuHidHaarTreeNode),
|
||||
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0, nodenum * sizeof(GpuHidHaarTreeNode),
|
||||
node, 0, NULL, NULL));
|
||||
candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY, 4 * sizeof(int) * outputsz);
|
||||
//openCLVerifyCall(status);
|
||||
|
||||
scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
|
||||
//flag = 1;
|
||||
//}
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
|
||||
|
||||
//t = (double)cvGetTickCount() - t;
|
||||
//printf( "update time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
|
||||
//size_t globalThreads[3] = { counter+blocksize*blocksize-counter%(blocksize*blocksize),1,1};
|
||||
//t = (double)cvGetTickCount();
|
||||
int startstage = 0;
|
||||
int endstage = gcascade->count;
|
||||
int startnode = 0;
|
||||
@ -1086,11 +1025,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
pq.s[3] = gcascade->pq3;
|
||||
float correction = gcascade->inv_window_area;
|
||||
|
||||
//int grpnumperline = ((m + localThreads[0] - 1) / localThreads[0]);
|
||||
//int totalgrp = ((n + localThreads[1] - 1) / localThreads[1])*grpnumperline;
|
||||
// openCLVerifyKernel(gsum.clCxt, kernel, &blocksize, globalThreads, localThreads);
|
||||
//openCLSafeCall(clSetKernelArg(kernel,argcount++,sizeof(cl_mem),(void*)&cascadebuffer));
|
||||
|
||||
std::vector<std::pair<size_t, const void *> > args;
|
||||
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer ));
|
||||
@ -1110,28 +1044,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
args.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction ));
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
|
||||
//t = (double)cvGetTickCount() - t;
|
||||
//printf( "detection time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
//t = (double)cvGetTickCount();
|
||||
//openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->impl->clCmdQueue, candidatebuffer, 1, 0, 4 * sizeof(int)*outputsz, candidate, 0, NULL, NULL));
|
||||
|
||||
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
|
||||
|
||||
for(int i = 0; i < outputsz; i++)
|
||||
if(candidate[4 * i + 2] != 0)
|
||||
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], candidate[4 * i + 2], candidate[4 * i + 3]));
|
||||
// t = (double)cvGetTickCount() - t;
|
||||
//printf( "post time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
//t = (double)cvGetTickCount();
|
||||
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
|
||||
candidate[4 * i + 2], candidate[4 * i + 3]));
|
||||
|
||||
free(scaleinfo);
|
||||
free(candidate);
|
||||
//openCLSafeCall(clReleaseMemObject(cascadebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(stagebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
|
||||
openCLSafeCall(clReleaseMemObject(nodebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(candidatebuffer));
|
||||
// openCLSafeCall(clReleaseKernel(kernel));
|
||||
//t = (double)cvGetTickCount() - t;
|
||||
//printf( "release time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1149,7 +1075,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
|
||||
node = (GpuHidHaarTreeNode *)(classifier->node);
|
||||
cl_mem stagebuffer;
|
||||
//cl_mem classifierbuffer;
|
||||
cl_mem nodebuffer;
|
||||
cl_mem candidatebuffer;
|
||||
cl_mem scaleinfobuffer;
|
||||
@ -1184,24 +1109,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
size_t blocksize = 8;
|
||||
size_t localThreads[3] = { blocksize, blocksize , 1 };
|
||||
size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->computeUnits() *localThreads[0],
|
||||
localThreads[1], 1
|
||||
};
|
||||
localThreads[1], 1 };
|
||||
int outputsz = 256 * globalThreads[0] / localThreads[0];
|
||||
int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) -
|
||||
sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode);
|
||||
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY,
|
||||
nodenum * sizeof(GpuHidHaarTreeNode));
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), nodebuffer, 1, 0,
|
||||
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0,
|
||||
nodenum * sizeof(GpuHidHaarTreeNode),
|
||||
node, 0, NULL, NULL));
|
||||
cl_mem newnodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_WRITE,
|
||||
loopcount * nodenum * sizeof(GpuHidHaarTreeNode));
|
||||
int startstage = 0;
|
||||
int endstage = gcascade->count;
|
||||
//cl_kernel kernel;
|
||||
//kernel = openCLGetKernelFromSource(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2");
|
||||
//cl_kernel kernel2 = openCLGetKernelFromSource(gimg.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier");
|
||||
for(int i = 0; i < loopcount; i++)
|
||||
{
|
||||
sz = sizev[i];
|
||||
@ -1220,7 +1141,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
int height = (gsum.rows - 1 - sz.height + ystep - 1) / ystep;
|
||||
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
|
||||
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
|
||||
//outputsz +=width*height;
|
||||
|
||||
scaleinfo[i].width_height = (width << 16) | height;
|
||||
scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
|
||||
scaleinfo[i].imgoff = 0;
|
||||
@ -1238,28 +1159,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
size_t globalThreads2[3] = {nodenum, 1, 1};
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
|
||||
|
||||
//clEnqueueNDRangeKernel(gsum.clCxt->impl->clCmdQueue, kernel2, 1, NULL, globalThreads2, 0, 0, NULL, NULL);
|
||||
//clFinish(gsum.clCxt->impl->clCmdQueue);
|
||||
}
|
||||
//clReleaseKernel(kernel2);
|
||||
|
||||
int step = gsum.step / 4;
|
||||
int startnode = 0;
|
||||
int splitstage = 3;
|
||||
int splitnode = stage[0].count + stage[1].count + stage[2].count;
|
||||
stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
|
||||
candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, 4 * sizeof(int) * outputsz);
|
||||
//openCLVerifyCall(status);
|
||||
scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
|
||||
pbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_int4) * loopcount);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL));
|
||||
correctionbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_float) * loopcount);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL));
|
||||
//int argcount = 0;
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL));
|
||||
|
||||
std::vector<std::pair<size_t, const void *> > args;
|
||||
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
|
||||
@ -1268,22 +1181,21 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&candidatebuffer ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&step ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&loopcount ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startstage ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitstage ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&endstage ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnode ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitnode ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&pbuffer ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&correctionbuffer ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&nodenum ));
|
||||
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
|
||||
|
||||
//openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->clCmdQueue,candidatebuffer,1,0,4*sizeof(int)*outputsz,candidate,0,NULL,NULL));
|
||||
candidate = (int *)clEnqueueMapBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int), 0, 0, 0, &status);
|
||||
candidate = (int *)clEnqueueMapBuffer(qu, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, &status);
|
||||
|
||||
for(int i = 0; i < outputsz; i++)
|
||||
{
|
||||
@ -1294,7 +1206,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
free(scaleinfo);
|
||||
free(p);
|
||||
free(correction);
|
||||
clEnqueueUnmapMemObject((cl_command_queue)gsum.clCxt->oclCommandQueue(), candidatebuffer, candidate, 0, 0, 0);
|
||||
clEnqueueUnmapMemObject(qu, candidatebuffer, candidate, 0, 0, 0);
|
||||
openCLSafeCall(clReleaseMemObject(stagebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
|
||||
openCLSafeCall(clReleaseMemObject(nodebuffer));
|
||||
@ -1303,20 +1215,547 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
openCLSafeCall(clReleaseMemObject(pbuffer));
|
||||
openCLSafeCall(clReleaseMemObject(correctionbuffer));
|
||||
}
|
||||
//t = (double)cvGetTickCount() ;
|
||||
|
||||
cvFree(&cascade->hid_cascade);
|
||||
// printf("%d\n",globalcounter);
|
||||
rectList.resize(allCandidates.size());
|
||||
if(!allCandidates.empty())
|
||||
std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
|
||||
|
||||
//cout << "count = " << rectList.size()<< endl;
|
||||
|
||||
if( minNeighbors != 0 || findBiggestObject )
|
||||
groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
|
||||
else
|
||||
rweights.resize(rectList.size(), 0);
|
||||
|
||||
if( findBiggestObject && rectList.size() )
|
||||
{
|
||||
CvAvgComp result_comp = {{0, 0, 0, 0}, 0};
|
||||
|
||||
for( size_t i = 0; i < rectList.size(); i++ )
|
||||
{
|
||||
cv::Rect r = rectList[i];
|
||||
if( r.area() > cv::Rect(result_comp.rect).area() )
|
||||
{
|
||||
result_comp.rect = r;
|
||||
result_comp.neighbors = rweights[i];
|
||||
}
|
||||
}
|
||||
cvSeqPush( result_seq, &result_comp );
|
||||
}
|
||||
else
|
||||
{
|
||||
for( size_t i = 0; i < rectList.size(); i++ )
|
||||
{
|
||||
CvAvgComp c;
|
||||
c.rect = rectList[i];
|
||||
c.neighbors = rweights[i];
|
||||
cvSeqPush( result_seq, &c );
|
||||
}
|
||||
}
|
||||
|
||||
return result_seq;
|
||||
}
|
||||
|
||||
struct OclBuffers
|
||||
{
|
||||
cl_mem stagebuffer;
|
||||
cl_mem nodebuffer;
|
||||
cl_mem candidatebuffer;
|
||||
cl_mem scaleinfobuffer;
|
||||
cl_mem pbuffer;
|
||||
cl_mem correctionbuffer;
|
||||
cl_mem newnodebuffer;
|
||||
};
|
||||
|
||||
struct getRect
|
||||
{
|
||||
Rect operator()(const CvAvgComp &e) const
|
||||
{
|
||||
return e.rect;
|
||||
}
|
||||
};
|
||||
|
||||
void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv::Rect>& faces,
|
||||
double scaleFactor, int minNeighbors, int flags,
|
||||
Size minSize, Size maxSize)
|
||||
{
|
||||
int blocksize = 8;
|
||||
int grp_per_CU = 12;
|
||||
size_t localThreads[3] = { blocksize, blocksize, 1 };
|
||||
size_t globalThreads[3] = { grp_per_CU * Context::getContext()->computeUnits() * localThreads[0],
|
||||
localThreads[1],
|
||||
1 };
|
||||
int outputsz = 256 * globalThreads[0] / localThreads[0];
|
||||
|
||||
Init(gimg.rows, gimg.cols, scaleFactor, flags, outputsz, localThreads, minSize, maxSize);
|
||||
|
||||
const double GROUP_EPS = 0.2;
|
||||
|
||||
cv::ConcurrentRectVector allCandidates;
|
||||
std::vector<cv::Rect> rectList;
|
||||
std::vector<int> rweights;
|
||||
|
||||
CvHaarClassifierCascade *cascade = oldCascade;
|
||||
GpuHidHaarClassifierCascade *gcascade;
|
||||
GpuHidHaarStageClassifier *stage;
|
||||
GpuHidHaarClassifier *classifier;
|
||||
GpuHidHaarTreeNode *node;
|
||||
|
||||
if( CV_MAT_DEPTH(gimg.type()) != CV_8U )
|
||||
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit images are supported" );
|
||||
|
||||
if( CV_MAT_CN(gimg.type()) > 1 )
|
||||
{
|
||||
oclMat gtemp;
|
||||
cvtColor( gimg, gtemp, CV_BGR2GRAY );
|
||||
gimg = gtemp;
|
||||
}
|
||||
|
||||
int *candidate;
|
||||
|
||||
if( (flags & CV_HAAR_SCALE_IMAGE) )
|
||||
{
|
||||
int indexy = 0;
|
||||
CvSize sz;
|
||||
|
||||
cv::Rect roi, roi2;
|
||||
cv::Mat imgroi, imgroisq;
|
||||
cv::ocl::oclMat resizeroi, gimgroi, gimgroisq;
|
||||
|
||||
for( int i = 0; i < m_loopcount; i++ )
|
||||
{
|
||||
sz = sizev[i];
|
||||
roi = Rect(0, indexy, sz.width, sz.height);
|
||||
roi2 = Rect(0, 0, sz.width - 1, sz.height - 1);
|
||||
resizeroi = gimg1(roi2);
|
||||
gimgroi = gsum(roi);
|
||||
gimgroisq = gsqsum(roi);
|
||||
|
||||
cv::ocl::resize(gimg, resizeroi, Size(sz.width - 1, sz.height - 1), 0, 0, INTER_LINEAR);
|
||||
cv::ocl::integral(resizeroi, gimgroi, gimgroisq);
|
||||
indexy += sz.height;
|
||||
}
|
||||
|
||||
gcascade = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
|
||||
stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
|
||||
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
|
||||
node = (GpuHidHaarTreeNode *)(classifier->node);
|
||||
|
||||
gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
|
||||
|
||||
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0,
|
||||
sizeof(GpuHidHaarStageClassifier) * gcascade->count,
|
||||
stage, 0, NULL, NULL));
|
||||
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
|
||||
m_nodenum * sizeof(GpuHidHaarTreeNode),
|
||||
node, 0, NULL, NULL));
|
||||
|
||||
int startstage = 0;
|
||||
int endstage = gcascade->count;
|
||||
int startnode = 0;
|
||||
int pixelstep = gsum.step / 4;
|
||||
int splitstage = 3;
|
||||
int splitnode = stage[0].count + stage[1].count + stage[2].count;
|
||||
cl_int4 p, pq;
|
||||
p.s[0] = gcascade->p0;
|
||||
p.s[1] = gcascade->p1;
|
||||
p.s[2] = gcascade->p2;
|
||||
p.s[3] = gcascade->p3;
|
||||
pq.s[0] = gcascade->pq0;
|
||||
pq.s[1] = gcascade->pq1;
|
||||
pq.s[2] = gcascade->pq2;
|
||||
pq.s[3] = gcascade->pq3;
|
||||
float correction = gcascade->inv_window_area;
|
||||
|
||||
vector<pair<size_t, const void *> > args;
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->nodebuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&pixelstep ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitnode ));
|
||||
args.push_back ( make_pair(sizeof(cl_int4) , (void *)&p ));
|
||||
args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
|
||||
args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
|
||||
|
||||
candidate = (int *)malloc(4 * sizeof(int) * outputsz);
|
||||
memset(candidate, 0, 4 * sizeof(int) * outputsz);
|
||||
openCLReadBuffer( gsum.clCxt, ((OclBuffers *)buffers)->candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
|
||||
|
||||
for(int i = 0; i < outputsz; i++)
|
||||
if(candidate[4 * i + 2] != 0)
|
||||
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
|
||||
candidate[4 * i + 2], candidate[4 * i + 3]));
|
||||
|
||||
free((void *)candidate);
|
||||
candidate = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
cv::ocl::integral(gimg, gsum, gsqsum);
|
||||
|
||||
gpuSetHaarClassifierCascade(cascade);
|
||||
|
||||
gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
|
||||
stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
|
||||
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
|
||||
node = (GpuHidHaarTreeNode *)(classifier->node);
|
||||
|
||||
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
|
||||
m_nodenum * sizeof(GpuHidHaarTreeNode),
|
||||
node, 0, NULL, NULL));
|
||||
|
||||
cl_int4 *p = (cl_int4 *)malloc(sizeof(cl_int4) * m_loopcount);
|
||||
float *correction = (float *)malloc(sizeof(float) * m_loopcount);
|
||||
int startstage = 0;
|
||||
int endstage = gcascade->count;
|
||||
double factor;
|
||||
for(int i = 0; i < m_loopcount; i++)
|
||||
{
|
||||
factor = scalev[i];
|
||||
int equRect_x = (int)(factor * gcascade->p0 + 0.5);
|
||||
int equRect_y = (int)(factor * gcascade->p1 + 0.5);
|
||||
int equRect_w = (int)(factor * gcascade->p3 + 0.5);
|
||||
int equRect_h = (int)(factor * gcascade->p2 + 0.5);
|
||||
p[i].s[0] = equRect_x;
|
||||
p[i].s[1] = equRect_y;
|
||||
p[i].s[2] = equRect_x + equRect_w;
|
||||
p[i].s[3] = equRect_y + equRect_h;
|
||||
correction[i] = 1. / (equRect_w * equRect_h);
|
||||
int startnodenum = m_nodenum * i;
|
||||
float factor2 = (float)factor;
|
||||
|
||||
vector<pair<size_t, const void *> > args1;
|
||||
args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->nodebuffer ));
|
||||
args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer ));
|
||||
args1.push_back ( make_pair(sizeof(cl_float) , (void *)&factor2 ));
|
||||
args1.push_back ( make_pair(sizeof(cl_float) , (void *)&correction[i] ));
|
||||
args1.push_back ( make_pair(sizeof(cl_int) , (void *)&startnodenum ));
|
||||
|
||||
size_t globalThreads2[3] = {m_nodenum, 1, 1};
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
|
||||
}
|
||||
|
||||
int step = gsum.step / 4;
|
||||
int startnode = 0;
|
||||
int splitstage = 3;
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->pbuffer, 1, 0, sizeof(cl_int4)*m_loopcount, p, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->correctionbuffer, 1, 0, sizeof(cl_float)*m_loopcount, correction, 0, NULL, NULL));
|
||||
|
||||
vector<pair<size_t, const void *> > args;
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&step ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->pbuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->correctionbuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_nodenum ));
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
|
||||
|
||||
candidate = (int *)clEnqueueMapBuffer(qu, ((OclBuffers *)buffers)->candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, NULL);
|
||||
|
||||
for(int i = 0; i < outputsz; i++)
|
||||
{
|
||||
if(candidate[4 * i + 2] != 0)
|
||||
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
|
||||
candidate[4 * i + 2], candidate[4 * i + 3]));
|
||||
}
|
||||
|
||||
free(p);
|
||||
free(correction);
|
||||
clEnqueueUnmapMemObject(qu, ((OclBuffers *)buffers)->candidatebuffer, candidate, 0, 0, 0);
|
||||
}
|
||||
|
||||
rectList.resize(allCandidates.size());
|
||||
if(!allCandidates.empty())
|
||||
std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
|
||||
|
||||
if( minNeighbors != 0 || findBiggestObject )
|
||||
groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
|
||||
else
|
||||
rweights.resize(rectList.size(), 0);
|
||||
|
||||
GenResult(faces, rectList, rweights);
|
||||
}
|
||||
|
||||
void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols,
|
||||
double scaleFactor, int flags,
|
||||
const int outputsz, const size_t localThreads[],
|
||||
CvSize minSize, CvSize maxSize)
|
||||
{
|
||||
CvHaarClassifierCascade *cascade = oldCascade;
|
||||
|
||||
if( !CV_IS_HAAR_CLASSIFIER(cascade) )
|
||||
CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" );
|
||||
|
||||
if( scaleFactor <= 1 )
|
||||
CV_Error( CV_StsOutOfRange, "scale factor must be > 1" );
|
||||
|
||||
if( cols < minSize.width || rows < minSize.height )
|
||||
CV_Error(CV_StsError, "Image too small");
|
||||
|
||||
int datasize=0;
|
||||
int totalclassifier=0;
|
||||
|
||||
if( !cascade->hid_cascade )
|
||||
gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
|
||||
|
||||
if( maxSize.height == 0 || maxSize.width == 0 )
|
||||
{
|
||||
maxSize.height = rows;
|
||||
maxSize.width = cols;
|
||||
}
|
||||
|
||||
findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
|
||||
if( findBiggestObject )
|
||||
flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
|
||||
|
||||
CreateBaseBufs(datasize, totalclassifier, flags, outputsz);
|
||||
CreateFactorRelatedBufs(rows, cols, flags, scaleFactor, localThreads, minSize, maxSize);
|
||||
|
||||
m_scaleFactor = scaleFactor;
|
||||
m_rows = rows;
|
||||
m_cols = cols;
|
||||
m_flags = flags;
|
||||
m_minSize = minSize;
|
||||
m_maxSize = maxSize;
|
||||
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
void cv::ocl::OclCascadeClassifierBuf::CreateBaseBufs(const int datasize, const int totalclassifier,
|
||||
const int flags, const int outputsz)
|
||||
{
|
||||
if (!initialized)
|
||||
{
|
||||
buffers = malloc(sizeof(OclBuffers));
|
||||
|
||||
size_t tempSize =
|
||||
sizeof(GpuHidHaarStageClassifier) * ((GpuHidHaarClassifierCascade *)oldCascade->hid_cascade)->count;
|
||||
m_nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) - tempSize - sizeof(GpuHidHaarClassifier) * totalclassifier)
|
||||
/ sizeof(GpuHidHaarTreeNode);
|
||||
|
||||
((OclBuffers *)buffers)->stagebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, tempSize);
|
||||
((OclBuffers *)buffers)->nodebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, m_nodenum * sizeof(GpuHidHaarTreeNode));
|
||||
}
|
||||
|
||||
if (initialized
|
||||
&& ((m_flags & CV_HAAR_SCALE_IMAGE) ^ (flags & CV_HAAR_SCALE_IMAGE)))
|
||||
{
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
|
||||
}
|
||||
|
||||
if (flags & CV_HAAR_SCALE_IMAGE)
|
||||
{
|
||||
((OclBuffers *)buffers)->candidatebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(),
|
||||
CL_MEM_WRITE_ONLY,
|
||||
4 * sizeof(int) * outputsz);
|
||||
}
|
||||
else
|
||||
{
|
||||
((OclBuffers *)buffers)->candidatebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(),
|
||||
CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
|
||||
4 * sizeof(int) * outputsz);
|
||||
}
|
||||
}
|
||||
|
||||
void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
|
||||
const int rows, const int cols, const int flags,
|
||||
const double scaleFactor, const size_t localThreads[],
|
||||
CvSize minSize, CvSize maxSize)
|
||||
{
|
||||
if (initialized)
|
||||
{
|
||||
if ((m_flags & CV_HAAR_SCALE_IMAGE) && !(flags & CV_HAAR_SCALE_IMAGE))
|
||||
{
|
||||
gimg1.release();
|
||||
gsum.release();
|
||||
gsqsum.release();
|
||||
}
|
||||
else if (!(m_flags & CV_HAAR_SCALE_IMAGE) && (flags & CV_HAAR_SCALE_IMAGE))
|
||||
{
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
|
||||
}
|
||||
else if ((m_flags & CV_HAAR_SCALE_IMAGE) && (flags & CV_HAAR_SCALE_IMAGE))
|
||||
{
|
||||
if (fabs(m_scaleFactor - scaleFactor) < 1e-6
|
||||
&& (rows == m_rows && cols == m_cols)
|
||||
&& (minSize.width == m_minSize.width)
|
||||
&& (minSize.height == m_minSize.height)
|
||||
&& (maxSize.width == m_maxSize.width)
|
||||
&& (maxSize.height == m_maxSize.height))
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (fabs(m_scaleFactor - scaleFactor) < 1e-6
|
||||
&& (rows == m_rows && cols == m_cols)
|
||||
&& (minSize.width == m_minSize.width)
|
||||
&& (minSize.height == m_minSize.height)
|
||||
&& (maxSize.width == m_maxSize.width)
|
||||
&& (maxSize.height == m_maxSize.height))
|
||||
{
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int loopcount;
|
||||
int indexy = 0;
|
||||
int totalheight = 0;
|
||||
double factor;
|
||||
Rect roi;
|
||||
CvSize sz;
|
||||
CvSize winSize0 = oldCascade->orig_window_size;
|
||||
detect_piramid_info *scaleinfo;
|
||||
if (flags & CV_HAAR_SCALE_IMAGE)
|
||||
{
|
||||
for(factor = 1.f;; factor *= scaleFactor)
|
||||
{
|
||||
CvSize winSize = { cvRound(winSize0.width * factor), cvRound(winSize0.height * factor) };
|
||||
sz.width = cvRound( cols / factor ) + 1;
|
||||
sz.height = cvRound( rows / factor ) + 1;
|
||||
CvSize sz1 = { sz.width - winSize0.width - 1, sz.height - winSize0.height - 1 };
|
||||
|
||||
if( sz1.width <= 0 || sz1.height <= 0 )
|
||||
break;
|
||||
if( winSize.width > maxSize.width || winSize.height > maxSize.height )
|
||||
break;
|
||||
if( winSize.width < minSize.width || winSize.height < minSize.height )
|
||||
continue;
|
||||
|
||||
totalheight += sz.height;
|
||||
sizev.push_back(sz);
|
||||
scalev.push_back(static_cast<float>(factor));
|
||||
}
|
||||
|
||||
loopcount = sizev.size();
|
||||
gimg1.create(rows, cols, CV_8UC1);
|
||||
gsum.create(totalheight + 4, cols + 1, CV_32SC1);
|
||||
gsqsum.create(totalheight + 4, cols + 1, CV_32FC1);
|
||||
|
||||
scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
|
||||
for( int i = 0; i < loopcount; i++ )
|
||||
{
|
||||
sz = sizev[i];
|
||||
roi = Rect(0, indexy, sz.width, sz.height);
|
||||
int width = sz.width - 1 - oldCascade->orig_window_size.width;
|
||||
int height = sz.height - 1 - oldCascade->orig_window_size.height;
|
||||
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
|
||||
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
|
||||
|
||||
((detect_piramid_info *)scaleinfo)[i].width_height = (width << 16) | height;
|
||||
((detect_piramid_info *)scaleinfo)[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
|
||||
((detect_piramid_info *)scaleinfo)[i].imgoff = gsum(roi).offset >> 2;
|
||||
((detect_piramid_info *)scaleinfo)[i].factor = scalev[i];
|
||||
|
||||
indexy += sz.height;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(factor = 1;
|
||||
cvRound(factor * winSize0.width) < cols - 10 && cvRound(factor * winSize0.height) < rows - 10;
|
||||
factor *= scaleFactor)
|
||||
{
|
||||
CvSize winSize = { cvRound( winSize0.width * factor ), cvRound( winSize0.height * factor ) };
|
||||
if( winSize.width < minSize.width || winSize.height < minSize.height )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
sizev.push_back(winSize);
|
||||
scalev.push_back(factor);
|
||||
}
|
||||
|
||||
loopcount = scalev.size();
|
||||
if(loopcount == 0)
|
||||
{
|
||||
loopcount = 1;
|
||||
sizev.push_back(minSize);
|
||||
scalev.push_back( min(cvRound(minSize.width / winSize0.width), cvRound(minSize.height / winSize0.height)) );
|
||||
}
|
||||
|
||||
((OclBuffers *)buffers)->pbuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY,
|
||||
sizeof(cl_int4) * loopcount);
|
||||
((OclBuffers *)buffers)->correctionbuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY,
|
||||
sizeof(cl_float) * loopcount);
|
||||
((OclBuffers *)buffers)->newnodebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_WRITE,
|
||||
loopcount * m_nodenum * sizeof(GpuHidHaarTreeNode));
|
||||
|
||||
scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
|
||||
for( int i = 0; i < loopcount; i++ )
|
||||
{
|
||||
sz = sizev[i];
|
||||
factor = scalev[i];
|
||||
int ystep = cvRound(std::max(2., factor));
|
||||
int width = (cols - 1 - sz.width + ystep - 1) / ystep;
|
||||
int height = (rows - 1 - sz.height + ystep - 1) / ystep;
|
||||
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
|
||||
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
|
||||
|
||||
((detect_piramid_info *)scaleinfo)[i].width_height = (width << 16) | height;
|
||||
((detect_piramid_info *)scaleinfo)[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
|
||||
((detect_piramid_info *)scaleinfo)[i].imgoff = 0;
|
||||
((detect_piramid_info *)scaleinfo)[i].factor = factor;
|
||||
}
|
||||
}
|
||||
|
||||
if (loopcount != m_loopcount)
|
||||
{
|
||||
if (initialized)
|
||||
{
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
|
||||
}
|
||||
((OclBuffers *)buffers)->scaleinfobuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
|
||||
}
|
||||
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)cv::ocl::Context::getContext()->oclCommandQueue(), ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0,
|
||||
sizeof(detect_piramid_info)*loopcount,
|
||||
scaleinfo, 0, NULL, NULL));
|
||||
free(scaleinfo);
|
||||
|
||||
m_loopcount = loopcount;
|
||||
}
|
||||
|
||||
void cv::ocl::OclCascadeClassifierBuf::GenResult(CV_OUT std::vector<cv::Rect>& faces,
|
||||
const std::vector<cv::Rect> &rectList,
|
||||
const std::vector<int> &rweights)
|
||||
{
|
||||
CvSeq *result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), cvCreateMemStorage(0) );
|
||||
|
||||
if( findBiggestObject && rectList.size() )
|
||||
{
|
||||
@ -1343,13 +1782,34 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
cvSeqPush( result_seq, &c );
|
||||
}
|
||||
}
|
||||
//t = (double)cvGetTickCount() - t;
|
||||
//printf( "get face time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
//alltime = (double)cvGetTickCount() - alltime;
|
||||
//printf( "all time = %g ms\n", alltime/((double)cvGetTickFrequency()*1000.) );
|
||||
return result_seq;
|
||||
|
||||
vector<CvAvgComp> vecAvgComp;
|
||||
Seq<CvAvgComp>(result_seq).copyTo(vecAvgComp);
|
||||
faces.resize(vecAvgComp.size());
|
||||
std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect());
|
||||
}
|
||||
|
||||
void cv::ocl::OclCascadeClassifierBuf::release()
|
||||
{
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->stagebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->nodebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
|
||||
|
||||
if( (m_flags & CV_HAAR_SCALE_IMAGE) )
|
||||
{
|
||||
cvFree(&oldCascade->hid_cascade);
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
|
||||
}
|
||||
|
||||
free(buffers);
|
||||
buffers = NULL;
|
||||
}
|
||||
|
||||
#ifndef _MAX_PATH
|
||||
#define _MAX_PATH 1024
|
||||
|
@ -1012,10 +1012,8 @@ namespace cv
|
||||
warpPerspective_gpu(src, dst, coeffs, interpolation);
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// integral
|
||||
|
||||
void integral(const oclMat &src, oclMat &sum, oclMat &sqsum)
|
||||
{
|
||||
CV_Assert(src.type() == CV_8UC1);
|
||||
@ -1029,13 +1027,24 @@ namespace cv
|
||||
int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
|
||||
|
||||
oclMat t_sum , t_sqsum;
|
||||
t_sum.create(src.cols, src.rows, CV_32SC1);
|
||||
t_sqsum.create(src.cols, src.rows, CV_32FC1);
|
||||
|
||||
int w = src.cols + 1, h = src.rows + 1;
|
||||
sum.create(h, w, CV_32SC1);
|
||||
int depth;
|
||||
if( src.cols * src.rows <= 2901 * 2901 ) //2901 is the maximum size for int when all values are 255
|
||||
{
|
||||
t_sum.create(src.cols, src.rows, CV_32SC1);
|
||||
sum.create(h, w, CV_32SC1);
|
||||
}
|
||||
else
|
||||
{
|
||||
//Use float to prevent overflow
|
||||
t_sum.create(src.cols, src.rows, CV_32FC1);
|
||||
sum.create(h, w, CV_32FC1);
|
||||
}
|
||||
t_sqsum.create(src.cols, src.rows, CV_32FC1);
|
||||
sqsum.create(h, w, CV_32FC1);
|
||||
int sum_offset = sum.offset / vlen, sqsum_offset = sqsum.offset / vlen;
|
||||
depth = sum.depth();
|
||||
int sum_offset = sum.offset / vlen;
|
||||
int sqsum_offset = sqsum.offset / vlen;
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
|
||||
@ -1048,7 +1057,7 @@ namespace cv
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.step));
|
||||
size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_cols", gt, lt, args, -1, -1);
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_cols", gt, lt, args, -1, depth);
|
||||
args.clear();
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
|
||||
@ -1062,9 +1071,9 @@ namespace cv
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum_offset));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sqsum_offset));
|
||||
size_t gt2[3] = {t_sum.cols * 32, 1, 1}, lt2[3] = {256, 1, 1};
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_rows", gt2, lt2, args, -1, -1);
|
||||
//std::cout << "tested" << std::endl;
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_rows", gt2, lt2, args, -1, depth);
|
||||
}
|
||||
|
||||
void integral(const oclMat &src, oclMat &sum)
|
||||
{
|
||||
CV_Assert(src.type() == CV_8UC1);
|
||||
@ -1074,10 +1083,18 @@ namespace cv
|
||||
int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
|
||||
|
||||
oclMat t_sum;
|
||||
t_sum.create(src.cols, src.rows, CV_32SC1);
|
||||
|
||||
int w = src.cols + 1, h = src.rows + 1;
|
||||
sum.create(h, w, CV_32SC1);
|
||||
int depth;
|
||||
if(src.cols * src.rows <= 2901 * 2901)
|
||||
{
|
||||
t_sum.create(src.cols, src.rows, CV_32SC1);
|
||||
sum.create(h, w, CV_32SC1);
|
||||
}else
|
||||
{
|
||||
t_sum.create(src.cols, src.rows, CV_32FC1);
|
||||
sum.create(h, w, CV_32FC1);
|
||||
}
|
||||
depth = sum.depth();
|
||||
int sum_offset = sum.offset / vlen;
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
@ -1090,7 +1107,7 @@ namespace cv
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.step));
|
||||
size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_cols", gt, lt, args, -1, -1);
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_cols", gt, lt, args, -1, depth);
|
||||
args.clear();
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sum.data ));
|
||||
@ -1100,7 +1117,7 @@ namespace cv
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum_offset));
|
||||
size_t gt2[3] = {t_sum.cols * 32, 1, 1}, lt2[3] = {256, 1, 1};
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_rows", gt2, lt2, args, -1, -1);
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_rows", gt2, lt2, args, -1, depth);
|
||||
//std::cout << "tested" << std::endl;
|
||||
}
|
||||
|
||||
|
@ -128,6 +128,8 @@ namespace cv
|
||||
std::vector<cl_device_id> devices;
|
||||
std::vector<String> devName;
|
||||
String platName;
|
||||
String clVersion;
|
||||
|
||||
cl_context oclcontext;
|
||||
cl_command_queue clCmdQueue;
|
||||
int devnum;
|
||||
@ -260,7 +262,7 @@ namespace cv
|
||||
|
||||
int setDevMemType(DevMemRW rw_type, DevMemType mem_type)
|
||||
{
|
||||
if( (mem_type == DEVICE_MEM_PM &&
|
||||
if( (mem_type == DEVICE_MEM_PM &&
|
||||
Context::getContext()->impl->unified_memory == 0) )
|
||||
return -1;
|
||||
gDeviceMemRW = rw_type;
|
||||
@ -303,6 +305,7 @@ namespace cv
|
||||
const static int max_name_length = 256;
|
||||
char deviceName[max_name_length];
|
||||
char plfmName[max_name_length];
|
||||
char clVersion[256];
|
||||
for (unsigned i = 0; i < numPlatforms; ++i)
|
||||
{
|
||||
|
||||
@ -322,6 +325,8 @@ namespace cv
|
||||
ocltmpinfo.PlatformName = String(plfmName);
|
||||
ocltmpinfo.impl->platName = String(plfmName);
|
||||
ocltmpinfo.impl->oclplatform = platforms[i];
|
||||
openCLSafeCall(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(clVersion), clVersion, NULL));
|
||||
ocltmpinfo.impl->clVersion = clVersion;
|
||||
for(unsigned j = 0; j < numsdev; ++j)
|
||||
{
|
||||
ocltmpinfo.impl->devices.push_back(devices[j]);
|
||||
@ -424,13 +429,13 @@ namespace cv
|
||||
}
|
||||
|
||||
void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
|
||||
size_t widthInBytes, size_t height,
|
||||
size_t widthInBytes, size_t height,
|
||||
DevMemRW rw_type, DevMemType mem_type, void* hptr)
|
||||
{
|
||||
cl_int status;
|
||||
if(hptr && (mem_type==DEVICE_MEM_UHP || mem_type==DEVICE_MEM_CHP))
|
||||
*dev_ptr = clCreateBuffer(clCxt->impl->oclcontext,
|
||||
gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
|
||||
*dev_ptr = clCreateBuffer(clCxt->impl->oclcontext,
|
||||
gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
|
||||
widthInBytes * height, hptr, &status);
|
||||
else
|
||||
*dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
|
||||
@ -985,6 +990,8 @@ namespace cv
|
||||
return impl->double_support == 1;
|
||||
case CL_UNIFIED_MEM:
|
||||
return impl->unified_memory == 1;
|
||||
case CL_VER_1_2:
|
||||
return impl->clVersion.find("OpenCL 1.2") != String::npos;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
@ -196,7 +196,7 @@ void cv::ocl::oclMat::upload(const Mat &m)
|
||||
// try to use host ptr
|
||||
createEx(wholeSize, m.type(), gDeviceMemRW, gDeviceMemType, m.datastart);
|
||||
if(gDeviceMemType!=DEVICE_MEM_UHP && gDeviceMemType!=DEVICE_MEM_CHP)
|
||||
openCLMemcpy2D(clCxt, data, step, m.datastart, m.step,
|
||||
openCLMemcpy2D(clCxt, data, step, m.datastart, m.step,
|
||||
wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
@ -571,11 +571,16 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, Stri
|
||||
CV_Error(Error::StsUnsupportedFormat, "unknown depth");
|
||||
}
|
||||
#ifdef CL_VERSION_1_2
|
||||
if(dst.offset == 0 && dst.cols == dst.wholecols)
|
||||
//this enables backwards portability to
|
||||
//run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
|
||||
if(Context::getContext()->supportsFeature(Context::CL_VER_1_2) &&
|
||||
dst.offset == 0 && dst.cols == dst.wholecols)
|
||||
{
|
||||
clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(), (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL);
|
||||
clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(),
|
||||
(cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
|
||||
@ -583,17 +588,8 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, Stri
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
|
||||
openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads,
|
||||
localThreads, args, -1, -1, compile_option);
|
||||
localThreads, args, -1, -1, compile_option);
|
||||
}
|
||||
#else
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
|
||||
openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads,
|
||||
localThreads, args, -1, -1, compile_option);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &mask, String kernelName)
|
||||
@ -887,7 +883,7 @@ oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const
|
||||
|
||||
}
|
||||
|
||||
void cv::ocl::oclMat::createEx(Size size, int type,
|
||||
void cv::ocl::oclMat::createEx(Size size, int type,
|
||||
DevMemRW rw_type, DevMemType mem_type, void* hptr)
|
||||
{
|
||||
createEx(size.height, size.width, type, rw_type, mem_type, hptr);
|
||||
@ -898,7 +894,7 @@ void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
|
||||
createEx(_rows, _cols, _type, gDeviceMemRW, gDeviceMemType);
|
||||
}
|
||||
|
||||
void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type,
|
||||
void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type,
|
||||
DevMemRW rw_type, DevMemType mem_type, void* hptr)
|
||||
{
|
||||
clCxt = Context::getContext();
|
||||
@ -919,7 +915,7 @@ void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type,
|
||||
size_t esz = elemSize();
|
||||
|
||||
void *dev_ptr;
|
||||
openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols),
|
||||
openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols),
|
||||
rows, rw_type, mem_type, hptr);
|
||||
|
||||
if (esz * cols == step)
|
||||
|
@ -43,11 +43,10 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
|
||||
#include "precomp.hpp"
|
||||
|
||||
#ifndef CL_VERSION_1_2
|
||||
#define CL_VERSION_1_2 0
|
||||
#endif
|
||||
using namespace std;
|
||||
|
||||
namespace cv
|
||||
{
|
||||
@ -160,30 +159,44 @@ namespace cv
|
||||
CV_Error(-1, "Image forma is not supported");
|
||||
break;
|
||||
}
|
||||
#if CL_VERSION_1_2
|
||||
cl_image_desc desc;
|
||||
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
|
||||
desc.image_width = mat.cols;
|
||||
desc.image_height = mat.rows;
|
||||
desc.image_depth = 0;
|
||||
desc.image_array_size = 1;
|
||||
desc.image_row_pitch = 0;
|
||||
desc.image_slice_pitch = 0;
|
||||
desc.buffer = NULL;
|
||||
desc.num_mip_levels = 0;
|
||||
desc.num_samples = 0;
|
||||
texture = clCreateImage((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
|
||||
#else
|
||||
texture = clCreateImage2D(
|
||||
(cl_context)mat.clCxt->oclContext(),
|
||||
CL_MEM_READ_WRITE,
|
||||
&format,
|
||||
mat.cols,
|
||||
mat.rows,
|
||||
0,
|
||||
NULL,
|
||||
&err);
|
||||
#ifdef CL_VERSION_1_2
|
||||
//this enables backwards portability to
|
||||
//run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
|
||||
if(Context::getContext()->supportsFeature(Context::CL_VER_1_2))
|
||||
{
|
||||
cl_image_desc desc;
|
||||
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
|
||||
desc.image_width = mat.cols;
|
||||
desc.image_height = mat.rows;
|
||||
desc.image_depth = 0;
|
||||
desc.image_array_size = 1;
|
||||
desc.image_row_pitch = 0;
|
||||
desc.image_slice_pitch = 0;
|
||||
desc.buffer = NULL;
|
||||
desc.num_mip_levels = 0;
|
||||
desc.num_samples = 0;
|
||||
texture = clCreateImage((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
||||
#endif
|
||||
texture = clCreateImage2D(
|
||||
(cl_context)mat.clCxt->oclContext(),
|
||||
CL_MEM_READ_WRITE,
|
||||
&format,
|
||||
mat.cols,
|
||||
mat.rows,
|
||||
0,
|
||||
NULL,
|
||||
&err);
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
}
|
||||
size_t origin[] = { 0, 0, 0 };
|
||||
size_t region[] = { mat.cols, mat.rows, 1 };
|
||||
|
||||
@ -196,7 +209,7 @@ namespace cv
|
||||
clEnqueueCopyBufferRect((cl_command_queue)mat.clCxt->oclCommandQueue(), (cl_mem)mat.data, devData, origin, origin,
|
||||
regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL);
|
||||
clFlush((cl_command_queue)mat.clCxt->oclCommandQueue());
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
devData = (cl_mem)mat.data;
|
||||
@ -212,7 +225,6 @@ namespace cv
|
||||
openCLSafeCall(err);
|
||||
return texture;
|
||||
}
|
||||
|
||||
void releaseTexture(cl_mem& texture)
|
||||
{
|
||||
openCLFree(texture);
|
||||
|
@ -330,7 +330,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
|
||||
mom->m12 = dstsum[8];
|
||||
mom->m03 = dstsum[9];
|
||||
delete [] dstsum;
|
||||
|
||||
openCLSafeCall(clReleaseMemObject(sum));
|
||||
icvCompleteMomentState( mom );
|
||||
}
|
||||
|
||||
|
@ -1,966 +0,0 @@
|
||||
////////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Jiang Liyuan, jlyuan001.good@163.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************and with scalar without mask**************************************/
|
||||
__kernel void arithm_s_bitwise_and_C1_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index);
|
||||
uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);
|
||||
|
||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data & src2_data;
|
||||
|
||||
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
|
||||
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
|
||||
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C1_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
char4 src1_data = vload4(0, src1 + src1_index);
|
||||
char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);
|
||||
|
||||
char4 data = *((__global char4 *)(dst + dst_index));
|
||||
char4 tmp_data = src1_data & src2_data;
|
||||
|
||||
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
|
||||
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
|
||||
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
|
||||
|
||||
*((__global char4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C1_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort2 src2_data = (ushort2)(src2.x, src2.x);
|
||||
|
||||
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
|
||||
ushort2 tmp_data = src1_data & src2_data;
|
||||
|
||||
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
|
||||
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
|
||||
|
||||
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C1_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short2 src2_data = (short2)(src2.x, src2.x);
|
||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
||||
|
||||
short2 tmp_data = src1_data & src2_data;
|
||||
|
||||
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
|
||||
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
|
||||
|
||||
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C1_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
|
||||
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
|
||||
int src_data2 = src2.x;
|
||||
|
||||
int data = src_data1 & src_data2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C1_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
|
||||
char4 src1_data = *((__global char4 *)((__global char *)src1 + src1_index));
|
||||
char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
|
||||
|
||||
char4 data = *((__global char4 *)((__global char *)dst + dst_index));
|
||||
char4 tmp_data = src1_data & src2_data;
|
||||
|
||||
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
|
||||
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
|
||||
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_and_C1_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
|
||||
short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
|
||||
|
||||
short4 tmp_data = src1_data & src2_data;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_and_C2_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index);
|
||||
uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y);
|
||||
|
||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data & src2_data;
|
||||
|
||||
|
||||
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
|
||||
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C2_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
char4 src1_data = vload4(0, src1 + src1_index);
|
||||
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
|
||||
|
||||
char4 data = *((__global char4 *)(dst + dst_index));
|
||||
char4 tmp_data = src1_data & src2_data;
|
||||
|
||||
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
|
||||
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
|
||||
|
||||
*((__global char4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C2_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
|
||||
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
|
||||
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
|
||||
|
||||
ushort2 data = src_data1 & src_data2;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C2_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
|
||||
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
|
||||
short2 src_data2 = (short2)(src2.x, src2.y);
|
||||
|
||||
short2 data = src_data1 & src_data2;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C2_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
|
||||
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
|
||||
int2 src_data2 = (int2)(src2.x, src2.y);
|
||||
|
||||
int2 data = src_data1 & src_data2;
|
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_C2_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
|
||||
char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index));
|
||||
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
|
||||
|
||||
char8 tmp_data = src1_data & src2_data;
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_and_C2_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
|
||||
|
||||
short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
|
||||
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
|
||||
|
||||
short8 tmp_data = src1_data & src2_data;
|
||||
|
||||
*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_and_C3_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
|
||||
|
||||
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
|
||||
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
|
||||
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
|
||||
|
||||
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
|
||||
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
|
||||
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
|
||||
|
||||
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
|
||||
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
|
||||
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
|
||||
|
||||
uchar4 tmp_data_0 = src1_data_0 & src2_data_0;
|
||||
uchar4 tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
uchar4 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
|
||||
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_0.w : data_0.w;
|
||||
|
||||
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_1.xy : data_1.xy;
|
||||
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.zw : data_1.zw;
|
||||
|
||||
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.x : data_2.x;
|
||||
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
||||
? tmp_data_2.yzw : data_2.yzw;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
|
||||
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
|
||||
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C3_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
|
||||
|
||||
char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
|
||||
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
|
||||
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
|
||||
|
||||
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
|
||||
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
|
||||
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
|
||||
|
||||
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
|
||||
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
|
||||
char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
|
||||
|
||||
char4 tmp_data_0 = convert_char4_sat(convert_uchar4_sat(src1_data_0) & convert_uchar4_sat(src2_data_0));
|
||||
char4 tmp_data_1 = convert_char4_sat(convert_uchar4_sat(src1_data_1) & convert_uchar4_sat(src2_data_1));
|
||||
char4 tmp_data_2 = convert_char4_sat(convert_uchar4_sat(src1_data_2) & convert_uchar4_sat(src2_data_2));
|
||||
|
||||
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
|
||||
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_0.w : data_0.w;
|
||||
|
||||
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_1.xy : data_1.xy;
|
||||
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.zw : data_1.zw;
|
||||
|
||||
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.x : data_2.x;
|
||||
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
||||
? tmp_data_2.yzw : data_2.yzw;
|
||||
|
||||
*((__global char4 *)(dst + dst_index + 0)) = data_0;
|
||||
*((__global char4 *)(dst + dst_index + 4)) = data_1;
|
||||
*((__global char4 *)(dst + dst_index + 8)) = data_2;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C3_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
||||
|
||||
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
|
||||
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
|
||||
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
ushort2 src2_data_0 = (ushort2)(src2.x, src2.y);
|
||||
ushort2 src2_data_1 = (ushort2)(src2.z, src2.x);
|
||||
ushort2 src2_data_2 = (ushort2)(src2.y, src2.z);
|
||||
|
||||
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
|
||||
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
|
||||
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
ushort2 tmp_data_0 = src1_data_0 & src2_data_0;
|
||||
ushort2 tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
ushort2 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
// Scalar bitwise AND for a 3-channel 16-bit signed image (CV_16SC3):
// dst = src1 & (c0,c1,c2) where the channel values come from src2.x/.y/.z.
// Each work-item handles two 3-channel pixels (6 shorts = 12 bytes), read and
// written as three short2 chunks.  Because a row may start mid-chunk when
// dst_offset is not pixel-aligned, dst_align shifts the whole 12-byte window
// left by one pixel, and the dst_start/dst_end guards below mask out the lanes
// that fall outside the current row.
__kernel void arithm_s_bitwise_and_C3_D3 (
        __global short *src1, int src1_step, int src1_offset,
        __global short *dst, int dst_step, int dst_offset,
        short4 src2, int rows, int cols, int dst_step1)
{

    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 1;  // two pixels per work-item

        // 1 when the row start is misaligned by one 3-short pixel, else 0.
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start = mad24(y, dst_step, dst_offset);              // first valid byte of this dst row
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);  // one past the last valid byte
        int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));

        // vload2 because src1_index may not be short2-aligned after the shift.
        short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
        short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
        short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));

        // Scalar operand laid out to match the interleaved channels of two
        // consecutive pixels: chunk0=(c0,c1), chunk1=(c2,c0), chunk2=(c1,c2).
        short2 src2_data_0 = (short2)(src2.x, src2.y);
        short2 src2_data_1 = (short2)(src2.z, src2.x);
        short2 src2_data_2 = (short2)(src2.y, src2.z);

        // Current dst contents, preserved for lanes outside [dst_start, dst_end).
        short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
        short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
        short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));

        short2 tmp_data_0 = src1_data_0 & src2_data_0;
        short2 tmp_data_1 = src1_data_1 & src2_data_1;
        short2 tmp_data_2 = src1_data_2 & src2_data_2;

        // Merge: lanes belonging to pixel 0 (byte offsets 0..5) require
        // dst_index+0 to lie inside the row; lanes of pixel 1 (offsets 6..11)
        // require dst_index+6 to lie inside the row.
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
                   ? tmp_data_1.x : data_1.x;
        data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                   ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
                    ? tmp_data_2.xy : data_2.xy;

        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
}
|
||||
// Scalar bitwise AND for a 3-channel 32-bit image (CV_32SC3):
// dst = src1 & (src2.x, src2.y, src2.z); one work-item processes one pixel
// (3 ints = 12 bytes).  No alignment masking is needed here: every store
// covers exactly one whole channel word.
// Fix: the original loaded the three dst words into data_0..data_2 and never
// used them — the dead global reads are removed and the masked results are
// stored directly.
__kernel void arithm_s_bitwise_and_C3_D4 (
        __global int *src1, int src1_step, int src1_offset,
        __global int *dst, int dst_step, int dst_offset,
        int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 12));

        __global char *src_base = (__global char *)src1 + src1_index;
        __global char *dst_base = (__global char *)dst + dst_index;

        // Channel-wise AND with the scalar, one 4-byte word per channel.
        *((__global int *)(dst_base + 0)) = *((__global int *)(src_base + 0)) & src2.x;
        *((__global int *)(dst_base + 4)) = *((__global int *)(src_base + 4)) & src2.y;
        *((__global int *)(dst_base + 8)) = *((__global int *)(src_base + 8)) & src2.z;
    }
}
|
||||
// Scalar bitwise AND for a 3-channel 32-bit float image, operated on as raw
// bytes: one pixel = 3 floats = 12 bytes, processed as three char4 chunks.
// The scalar arrives as a char16 whose first 12 lanes hold the byte pattern
// of the three channel values.
// Fix: the original loaded data_0..data_2 from dst and never used them — the
// dead global reads are removed and the masked results are stored directly.
__kernel void arithm_s_bitwise_and_C3_D5 (
        __global char *src1, int src1_step, int src1_offset,
        __global char *dst, int dst_step, int dst_offset,
        char16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 12));

        char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
        char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
        char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));

        // First 12 scalar bytes, split into the three 4-byte chunks.
        char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
        char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
        char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);

        *((__global char4 *)((__global char *)dst + dst_index + 0)) = src1_data_0 & src2_data_0;
        *((__global char4 *)((__global char *)dst + dst_index + 4)) = src1_data_1 & src2_data_1;
        *((__global char4 *)((__global char *)dst + dst_index + 8)) = src1_data_2 & src2_data_2;
    }
}
|
||||
#if defined (DOUBLE_SUPPORT)
// Scalar bitwise AND for a 3-channel 64-bit (double) image, operated on as
// raw 16-bit lanes: one pixel = 3 doubles = 24 bytes, processed as three
// short4 chunks.  The scalar arrives as a short16 whose first 12 lanes hold
// the bit pattern of the three channel values.
// Fix: the original loaded data_0..data_2 from dst and never used them — the
// dead global reads are removed and the masked results are stored directly.
__kernel void arithm_s_bitwise_and_C3_D6 (
        __global short *src1, int src1_step, int src1_offset,
        __global short *dst, int dst_step, int dst_offset,
        short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 24));

        short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
        short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
        short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));

        // First 12 scalar lanes, split into the three 8-byte chunks.
        short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);

        *((__global short4 *)((__global char *)dst + dst_index + 0 )) = src1_data_0 & src2_data_0;
        *((__global short4 *)((__global char *)dst + dst_index + 8 )) = src1_data_1 & src2_data_1;
        *((__global short4 *)((__global char *)dst + dst_index + 16)) = src1_data_2 & src2_data_2;
    }
}
#endif
|
||||
// Scalar bitwise AND, 4-channel 8-bit unsigned: dst = src1 & src2.
// One uchar4 pixel per work-item; a pixel is exactly one vector, so no
// boundary masking is required.
__kernel void arithm_s_bitwise_and_C4_D0 (
        __global uchar *src1, int src1_step, int src1_offset,
        __global uchar *dst, int dst_step, int dst_offset,
        uchar4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;  // out-of-range work-item

    int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
    int dst_index  = mad24(y, dst_step, (x << 2) + dst_offset);

    // Load, mask and store in one fused expression.
    *((__global uchar4 *)(dst + dst_index)) =
        *((__global uchar4 *)(src1 + src1_index)) & src2;
}
|
||||
|
||||
|
||||
// Scalar bitwise AND, 4-channel 8-bit signed: dst = src1 & src2.
// One char4 pixel per work-item; a pixel is exactly one vector, so no
// boundary masking is required.
__kernel void arithm_s_bitwise_and_C4_D1 (
        __global char *src1, int src1_step, int src1_offset,
        __global char *dst, int dst_step, int dst_offset,
        char4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;  // out-of-range work-item

    int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
    int dst_index  = mad24(y, dst_step, (x << 2) + dst_offset);

    // Load, mask and store in one fused expression.
    *((__global char4 *)(dst + dst_index)) =
        *((__global char4 *)(src1 + src1_index)) & src2;
}
|
||||
|
||||
// Scalar bitwise AND, 4-channel 16-bit unsigned: dst = src1 & src2.
// One ushort4 pixel (8 bytes) per work-item; byte offsets are applied through
// a char pointer, as elsewhere in this file.
__kernel void arithm_s_bitwise_and_C4_D2 (
        __global ushort *src1, int src1_step, int src1_offset,
        __global ushort *dst, int dst_step, int dst_offset,
        ushort4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;  // out-of-range work-item

    int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
    int dst_index  = mad24(y, dst_step, (x << 3) + dst_offset);

    // Load, mask and store in one fused expression.
    *((__global ushort4 *)((__global char *)dst + dst_index)) =
        *((__global ushort4 *)((__global char *)src1 + src1_index)) & src2;
}
|
||||
// Scalar bitwise AND, 4-channel 16-bit signed: dst = src1 & src2.
// One short4 pixel (8 bytes) per work-item; byte offsets are applied through
// a char pointer, as elsewhere in this file.
__kernel void arithm_s_bitwise_and_C4_D3 (
        __global short *src1, int src1_step, int src1_offset,
        __global short *dst, int dst_step, int dst_offset,
        short4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;  // out-of-range work-item

    int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
    int dst_index  = mad24(y, dst_step, (x << 3) + dst_offset);

    // Load, mask and store in one fused expression.
    *((__global short4 *)((__global char *)dst + dst_index)) =
        *((__global short4 *)((__global char *)src1 + src1_index)) & src2;
}
|
||||
// Scalar bitwise AND, 4-channel 32-bit signed: dst = src1 & src2.
// One int4 pixel (16 bytes) per work-item.
__kernel void arithm_s_bitwise_and_C4_D4 (
        __global int *src1, int src1_step, int src1_offset,
        __global int *dst, int dst_step, int dst_offset,
        int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;  // out-of-range work-item

    int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
    int dst_index  = mad24(y, dst_step, (x << 4) + dst_offset);

    // Load, mask and store in one fused expression.
    *((__global int4 *)((__global char *)dst + dst_index)) =
        *((__global int4 *)((__global char *)src1 + src1_index)) & src2;
}
|
||||
// Scalar bitwise AND, 4-channel 32-bit float data operated on as raw bytes:
// one pixel = 4 floats = 16 bytes = one char16 vector per work-item.
// The original rebuilt src2 lane by lane into an identical char16; the scalar
// is used directly here — same lanes, same order, same result.
__kernel void arithm_s_bitwise_and_C4_D5 (
        __global char *src1, int src1_step, int src1_offset,
        __global char *dst, int dst_step, int dst_offset,
        char16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;  // out-of-range work-item

    int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
    int dst_index  = mad24(y, dst_step, (x << 4) + dst_offset);

    char16 src1_data = *((__global char16 *)((__global char *)src1 + src1_index));

    *((__global char16 *)((__global char *)dst + dst_index)) = src1_data & src2;
}
|
||||
#if defined (DOUBLE_SUPPORT)
// Scalar bitwise AND, 4-channel 64-bit (double) data operated on as raw
// 16-bit lanes: one pixel = 4 doubles = 32 bytes, processed as four short4
// chunks (8-byte loads/stores preserve the original alignment assumptions).
__kernel void arithm_s_bitwise_and_C4_D6 (
        __global short *src1, int src1_step, int src1_offset,
        __global short *dst, int dst_step, int dst_offset,
        short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;  // out-of-range work-item

    int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
    int dst_index  = mad24(y, dst_step, (x << 5) + dst_offset);

    __global char *src_base = (__global char *)src1 + src1_index;
    __global char *dst_base = (__global char *)dst + dst_index;

    // One fused load/AND/store per 8-byte chunk; scalar lanes follow byte order.
    *((__global short4 *)(dst_base + 0 )) = *((__global short4 *)(src_base + 0 )) & (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
    *((__global short4 *)(dst_base + 8 )) = *((__global short4 *)(src_base + 8 )) & (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
    *((__global short4 *)(dst_base + 16)) = *((__global short4 *)(src_base + 16)) & (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
    *((__global short4 *)(dst_base + 24)) = *((__global short4 *)(src_base + 24)) & (short4)(src2.sc, src2.sd, src2.se, src2.sf);
}
#endif
|
@ -16,6 +16,7 @@
|
||||
//
|
||||
// @Authors
|
||||
// Jiang Liyuan, jlyuan001.good@163.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -50,11 +51,17 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_and without mask**************************************/
|
||||
__kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
//bitwise_binary without mask for and, or, xor operators
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////bitwise_binary///////////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef OP_BINARY
|
||||
#define OP_BINARY &
|
||||
#endif
|
||||
|
||||
__kernel void arithm_bitwise_binary_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
@ -95,7 +102,7 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data & src2_data;
|
||||
uchar4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
@ -107,7 +114,7 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__kernel void arithm_bitwise_binary_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
@ -148,7 +155,7 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
char4 dst_data = *((__global char4 *)(dst + dst_index));
|
||||
char4 tmp_data = src1_data & src2_data;
|
||||
char4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
@ -160,7 +167,7 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__kernel void arithm_bitwise_binary_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
@ -202,7 +209,7 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
||||
ushort4 tmp_data = src1_data & src2_data;
|
||||
ushort4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
@ -215,7 +222,7 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__kernel void arithm_bitwise_binary_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
@ -257,7 +264,7 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||
short4 tmp_data = src1_data & src2_data;
|
||||
short4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
@ -270,7 +277,7 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__kernel void arithm_bitwise_binary_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
@ -286,13 +293,13 @@ __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1
|
||||
|
||||
int data1 = *((__global int *)((__global char *)src1 + src1_index));
|
||||
int data2 = *((__global int *)((__global char *)src2 + src2_index));
|
||||
int tmp = data1 & data2;
|
||||
int tmp = data1 OP_BINARY data2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index)) = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__kernel void arithm_bitwise_binary_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
@ -308,14 +315,14 @@ __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src
|
||||
|
||||
char4 data1 = *((__global char4 *)((__global char *)src1 + src1_index));
|
||||
char4 data2 = *((__global char4 *)((__global char *)src2 + src2_index));
|
||||
char4 tmp = data1 & data2;
|
||||
char4 tmp = data1 OP_BINARY data2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index)) = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__kernel void arithm_bitwise_binary_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
@ -332,7 +339,7 @@ __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src
|
||||
char8 data1 = *((__global char8 *)((__global char *)src1 + src1_index));
|
||||
char8 data2 = *((__global char8 *)((__global char *)src2 + src2_index));
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = data1 & data2;
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = data1 OP_BINARY data2;
|
||||
}
|
||||
}
|
||||
#endif
|
@ -16,6 +16,7 @@
|
||||
//
|
||||
// @Authors
|
||||
// Jiang Liyuan, jlyuan001.good@163.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -49,11 +50,16 @@
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef OP_BINARY
|
||||
#define OP_BINARY &
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_and with mask**************************************/
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D0 (
|
||||
////////////////////////////////////////////bitwise_binary////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_binary with mask**************************************/
|
||||
__kernel void arithm_bitwise_binary_with_mask_C1_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -85,7 +91,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (
|
||||
uchar4 mask_data = vload4(0, mask + mask_index);
|
||||
|
||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data & src2_data;
|
||||
uchar4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
|
||||
@ -98,7 +104,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D1 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C1_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -130,7 +136,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (
|
||||
uchar4 mask_data = vload4(0, mask + mask_index);
|
||||
|
||||
char4 data = *((__global char4 *)(dst + dst_index));
|
||||
char4 tmp_data = src1_data & src2_data;
|
||||
char4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.x = convert_char((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = convert_char((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
|
||||
@ -143,7 +149,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D2 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C1_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -161,7 +167,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#define dst_align ((dst_offset / 2) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@ -175,7 +181,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
|
||||
ushort2 tmp_data = src1_data & src2_data;
|
||||
ushort2 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.x = convert_ushort((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = convert_ushort((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
|
||||
@ -186,7 +192,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D3 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C1_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -204,7 +210,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#define dst_align ((dst_offset / 2) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@ -218,7 +224,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
||||
short2 tmp_data = src1_data & src2_data;
|
||||
short2 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
|
||||
@ -229,7 +235,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D4 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C1_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -253,7 +259,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D4 (
|
||||
int src_data2 = *((__global int *)((__global char *)src2 + src2_index));
|
||||
int dst_data = *((__global int *)((__global char *)dst + dst_index));
|
||||
|
||||
int data = src_data1 & src_data2;
|
||||
int data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index)) = data;
|
||||
@ -262,7 +268,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D4 (
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D5 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C1_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -286,7 +292,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D5 (
|
||||
char4 src_data2 = *((__global char4 *)((__global char *)src2 + src2_index));
|
||||
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
|
||||
|
||||
char4 data = src_data1 & src_data2;
|
||||
char4 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index)) = data;
|
||||
@ -295,7 +301,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D5 (
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C1_D6 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C1_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -319,7 +325,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D6 (
|
||||
char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index));
|
||||
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
|
||||
|
||||
char8 data = src_data1 & src_data2;
|
||||
char8 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = data;
|
||||
@ -329,7 +335,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D6 (
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D0 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C2_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -347,7 +353,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#define dst_align ((dst_offset / 2) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@ -361,7 +367,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data & src2_data;
|
||||
uchar4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
|
||||
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
|
||||
@ -371,7 +377,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D1 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C2_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -389,7 +395,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#define dst_align ((dst_offset / 2) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
@ -403,7 +409,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
char4 data = *((__global char4 *)(dst + dst_index));
|
||||
char4 tmp_data = src1_data & src2_data;
|
||||
char4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
|
||||
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
|
||||
@ -412,7 +418,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D2 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C2_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -436,13 +442,13 @@ __kernel void arithm_bitwise_and_with_mask_C2_D2 (
|
||||
ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index));
|
||||
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
|
||||
|
||||
ushort2 data = src_data1 & src_data2;
|
||||
ushort2 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D3 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C2_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -466,13 +472,13 @@ __kernel void arithm_bitwise_and_with_mask_C2_D3 (
|
||||
short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index));
|
||||
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
|
||||
|
||||
short2 data = src_data1 & src_data2;
|
||||
short2 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D4 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C2_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -496,13 +502,13 @@ __kernel void arithm_bitwise_and_with_mask_C2_D4 (
|
||||
int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index));
|
||||
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
|
||||
|
||||
int2 data = src_data1 & src_data2;
|
||||
int2 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D5 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C2_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -526,14 +532,14 @@ __kernel void arithm_bitwise_and_with_mask_C2_D5 (
|
||||
char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index));
|
||||
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
|
||||
|
||||
char8 data = src_data1 & src_data2;
|
||||
char8 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C2_D6 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C2_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -557,7 +563,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 (
|
||||
char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index));
|
||||
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
|
||||
|
||||
char16 data = src_data1 & src_data2;
|
||||
char16 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global char16 *)((__global char *)dst + dst_index)) = data;
|
||||
@ -565,398 +571,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 (
|
||||
}
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
|
||||
|
||||
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
|
||||
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
|
||||
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
|
||||
|
||||
uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0);
|
||||
uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4);
|
||||
uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8);
|
||||
|
||||
uchar4 mask_data = vload4(0, mask + mask_index);
|
||||
|
||||
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
|
||||
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
|
||||
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
|
||||
|
||||
uchar4 tmp_data_0 = src1_data_0 & src2_data_0;
|
||||
uchar4 tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
uchar4 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
|
||||
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_0.w : data_0.w;
|
||||
|
||||
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_1.xy : data_1.xy;
|
||||
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.zw : data_1.zw;
|
||||
|
||||
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.x : data_2.x;
|
||||
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
||||
? tmp_data_2.yzw : data_2.yzw;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
|
||||
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
|
||||
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
|
||||
|
||||
char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
|
||||
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
|
||||
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
|
||||
|
||||
char4 src2_data_0 = vload4(0, src2 + src2_index + 0);
|
||||
char4 src2_data_1 = vload4(0, src2 + src2_index + 4);
|
||||
char4 src2_data_2 = vload4(0, src2 + src2_index + 8);
|
||||
|
||||
uchar4 mask_data = vload4(0, mask + mask_index);
|
||||
|
||||
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
|
||||
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
|
||||
char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
|
||||
|
||||
char4 tmp_data_0 = src1_data_0 & src2_data_0;
|
||||
char4 tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
char4 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
|
||||
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_0.w : data_0.w;
|
||||
|
||||
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_1.xy : data_1.xy;
|
||||
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.zw : data_1.zw;
|
||||
|
||||
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.x : data_2.x;
|
||||
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
||||
? tmp_data_2.yzw : data_2.yzw;
|
||||
|
||||
*((__global char4 *)(dst + dst_index + 0)) = data_0;
|
||||
*((__global char4 *)(dst + dst_index + 4)) = data_1;
|
||||
*((__global char4 *)(dst + dst_index + 8)) = data_2;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
||||
|
||||
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
|
||||
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
|
||||
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0));
|
||||
ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4));
|
||||
ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8));
|
||||
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
|
||||
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
|
||||
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
ushort2 tmp_data_0 = src1_data_0 & src2_data_0 ;
|
||||
ushort2 tmp_data_1 = src1_data_1 & src2_data_1 ;
|
||||
ushort2 tmp_data_2 = src1_data_2 & src2_data_2 ;
|
||||
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
||||
|
||||
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
|
||||
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
|
||||
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0));
|
||||
short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4));
|
||||
short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8));
|
||||
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
|
||||
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
|
||||
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
short2 tmp_data_0 = src1_data_0 & src2_data_0 ;
|
||||
short2 tmp_data_1 = src1_data_1 & src2_data_1 ;
|
||||
short2 tmp_data_2 = src1_data_2 & src2_data_2 ;
|
||||
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
||||
|
||||
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
|
||||
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
|
||||
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0));
|
||||
int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4));
|
||||
int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8));
|
||||
|
||||
uchar mask_data = * (mask + mask_index);
|
||||
|
||||
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
|
||||
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
|
||||
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
int tmp_data_0 = src1_data_0 & src2_data_0 ;
|
||||
int tmp_data_1 = src1_data_1 & src2_data_1 ;
|
||||
int tmp_data_2 = src1_data_2 & src2_data_2 ;
|
||||
|
||||
data_0 = mask_data ? tmp_data_0 : data_0;
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
||||
|
||||
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
|
||||
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
|
||||
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0));
|
||||
char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4));
|
||||
char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8));
|
||||
|
||||
uchar mask_data = * (mask + mask_index);
|
||||
|
||||
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
|
||||
char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
|
||||
char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
char4 tmp_data_0 = src1_data_0 & src2_data_0;
|
||||
char4 tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
char4 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
data_0 = mask_data ? tmp_data_0 : data_0;
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_and_with_mask_C3_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
|
||||
|
||||
char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 ));
|
||||
char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 ));
|
||||
char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));
|
||||
|
||||
char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 ));
|
||||
char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 ));
|
||||
char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
|
||||
|
||||
uchar mask_data = * (mask + mask_index);
|
||||
|
||||
char8 data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0 ));
|
||||
char8 data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8 ));
|
||||
char8 data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16));
|
||||
|
||||
char8 tmp_data_0 = src1_data_0 & src2_data_0;
|
||||
char8 tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
char8 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
data_0 = mask_data ? tmp_data_0 : data_0;
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D0 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C4_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -980,7 +595,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D0 (
|
||||
uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index));
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
|
||||
uchar4 data = src_data1 & src_data2;
|
||||
uchar4 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
||||
@ -988,7 +603,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D0 (
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D1 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C4_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -1012,14 +627,14 @@ __kernel void arithm_bitwise_and_with_mask_C4_D1 (
|
||||
char4 src_data2 = *((__global char4 *)(src2 + src2_index));
|
||||
char4 dst_data = *((__global char4 *)(dst + dst_index));
|
||||
|
||||
char4 data = src_data1 & src_data2;
|
||||
char4 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global char4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D2 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C4_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -1043,13 +658,13 @@ __kernel void arithm_bitwise_and_with_mask_C4_D2 (
|
||||
ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index));
|
||||
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
||||
|
||||
ushort4 data = src_data1 & src_data2;
|
||||
ushort4 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D3 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C4_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -1073,13 +688,13 @@ __kernel void arithm_bitwise_and_with_mask_C4_D3 (
|
||||
short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index));
|
||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||
|
||||
short4 data = src_data1 & src_data2;
|
||||
short4 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D4 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C4_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -1103,13 +718,13 @@ __kernel void arithm_bitwise_and_with_mask_C4_D4 (
|
||||
int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index));
|
||||
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
|
||||
|
||||
int4 data = src_data1 & src_data2;
|
||||
int4 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D5 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C4_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -1133,14 +748,14 @@ __kernel void arithm_bitwise_and_with_mask_C4_D5 (
|
||||
char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index));
|
||||
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
|
||||
|
||||
char16 data = src_data1 & src_data2;
|
||||
char16 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global char16 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_and_with_mask_C4_D6 (
|
||||
__kernel void arithm_bitwise_binary_with_mask_C4_D6 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -1175,10 +790,10 @@ __kernel void arithm_bitwise_and_with_mask_C4_D6 (
|
||||
char8 dst_data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16));
|
||||
char8 dst_data_3 = *((__global char8 *)((__global char *)dst + dst_index + 24));
|
||||
|
||||
char8 data_0 = src_data1_0 & src_data2_0;
|
||||
char8 data_1 = src_data1_1 & src_data2_1;
|
||||
char8 data_2 = src_data1_2 & src_data2_2;
|
||||
char8 data_3 = src_data1_3 & src_data2_3;
|
||||
char8 data_0 = src_data1_0 OP_BINARY src_data2_0;
|
||||
char8 data_1 = src_data1_1 OP_BINARY src_data2_1;
|
||||
char8 data_2 = src_data1_2 OP_BINARY src_data2_2;
|
||||
char8 data_3 = src_data1_3 OP_BINARY src_data2_3;
|
||||
|
||||
data_0 = mask_data ? data_0 : dst_data_0;
|
||||
data_1 = mask_data ? data_1 : dst_data_1;
|
@ -16,6 +16,7 @@
|
||||
//
|
||||
// @Authors
|
||||
// Jiang Liyuan, jlyuan001.good@163.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -49,11 +50,16 @@
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
|
||||
|
||||
#ifndef OP_BINARY
|
||||
#define OP_BINARY &
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************xor with scalar without mask**************************************/
|
||||
__kernel void arithm_s_bitwise_xor_C1_D0 (
|
||||
////////////////////////////////////////////bitwise_binary/////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/******************************bitwise binary with scalar without mask********************************/
|
||||
__kernel void arithm_s_bitwise_binary_C1_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
@ -79,7 +85,7 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (
|
||||
uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);
|
||||
|
||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data ^ src2_data;
|
||||
uchar4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
|
||||
@ -91,7 +97,7 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C1_D1 (
|
||||
__kernel void arithm_s_bitwise_binary_C1_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
@ -117,7 +123,7 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (
|
||||
char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);
|
||||
|
||||
char4 data = *((__global char4 *)(dst + dst_index));
|
||||
char4 tmp_data = src1_data ^ src2_data;
|
||||
char4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
|
||||
@ -128,7 +134,7 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C1_D2 (
|
||||
__kernel void arithm_s_bitwise_binary_C1_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
@ -155,7 +161,7 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (
|
||||
ushort2 src2_data = (ushort2)(src2.x, src2.x);
|
||||
|
||||
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
|
||||
ushort2 tmp_data = src1_data ^ src2_data;
|
||||
ushort2 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
|
||||
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
|
||||
@ -163,7 +169,7 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (
|
||||
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C1_D3 (
|
||||
__kernel void arithm_s_bitwise_binary_C1_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
@ -190,7 +196,7 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (
|
||||
short2 src2_data = (short2)(src2.x, src2.x);
|
||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
||||
|
||||
short2 tmp_data = src1_data ^ src2_data;
|
||||
short2 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
|
||||
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
|
||||
@ -198,7 +204,7 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (
|
||||
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C1_D4 (
|
||||
__kernel void arithm_s_bitwise_binary_C1_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
@ -215,12 +221,12 @@ __kernel void arithm_s_bitwise_xor_C1_D4 (
|
||||
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
|
||||
int src_data2 = src2.x;
|
||||
|
||||
int data = src_data1 ^ src_data2;
|
||||
int data = src_data1 OP_BINARY src_data2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C1_D5 (
|
||||
__kernel void arithm_s_bitwise_binary_C1_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
@ -241,7 +247,7 @@ __kernel void arithm_s_bitwise_xor_C1_D5 (
|
||||
char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
|
||||
|
||||
char4 data = *((__global char4 *)((__global char *)dst + dst_index));
|
||||
char4 tmp_data = src1_data ^ src2_data;
|
||||
char4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
|
||||
@ -251,9 +257,8 @@ __kernel void arithm_s_bitwise_xor_C1_D5 (
|
||||
*((__global char4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_xor_C1_D6 (
|
||||
__kernel void arithm_s_bitwise_binary_C1_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
@ -270,13 +275,13 @@ __kernel void arithm_s_bitwise_xor_C1_D6 (
|
||||
short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
|
||||
|
||||
short4 tmp_data = src1_data ^ src2_data;
|
||||
short4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_xor_C2_D0 (
|
||||
__kernel void arithm_s_bitwise_binary_C2_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
@ -303,7 +308,7 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (
|
||||
uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y);
|
||||
|
||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data ^ src2_data;
|
||||
uchar4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
|
||||
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
|
||||
@ -314,7 +319,7 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C2_D1 (
|
||||
__kernel void arithm_s_bitwise_binary_C2_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
@ -341,7 +346,7 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (
|
||||
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
|
||||
|
||||
char4 data = *((__global char4 *)(dst + dst_index));
|
||||
char4 tmp_data = src1_data ^ src2_data;
|
||||
char4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
|
||||
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
|
||||
@ -350,7 +355,7 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C2_D2 (
|
||||
__kernel void arithm_s_bitwise_binary_C2_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
@ -367,12 +372,12 @@ __kernel void arithm_s_bitwise_xor_C2_D2 (
|
||||
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
|
||||
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
|
||||
|
||||
ushort2 data = src_data1 ^ src_data2;
|
||||
ushort2 data = src_data1 OP_BINARY src_data2;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C2_D3 (
|
||||
__kernel void arithm_s_bitwise_binary_C2_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
@ -389,12 +394,12 @@ __kernel void arithm_s_bitwise_xor_C2_D3 (
|
||||
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
|
||||
short2 src_data2 = (short2)(src2.x, src2.y);
|
||||
|
||||
short2 data = src_data1 ^ src_data2;
|
||||
short2 data = src_data1 OP_BINARY src_data2;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C2_D4 (
|
||||
__kernel void arithm_s_bitwise_binary_C2_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
@ -411,11 +416,11 @@ __kernel void arithm_s_bitwise_xor_C2_D4 (
|
||||
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
|
||||
int2 src_data2 = (int2)(src2.x, src2.y);
|
||||
|
||||
int2 data = src_data1 ^ src_data2;
|
||||
int2 data = src_data1 OP_BINARY src_data2;
|
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C2_D5 (
|
||||
__kernel void arithm_s_bitwise_binary_C2_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
@ -432,13 +437,13 @@ __kernel void arithm_s_bitwise_xor_C2_D5 (
|
||||
char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index));
|
||||
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
|
||||
|
||||
char8 tmp_data = src1_data ^ src2_data;
|
||||
char8 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_xor_C2_D6 (
|
||||
__kernel void arithm_s_bitwise_binary_C2_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
@ -455,347 +460,14 @@ __kernel void arithm_s_bitwise_xor_C2_D6 (
|
||||
short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
|
||||
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
|
||||
|
||||
short8 tmp_data = src1_data ^ src2_data;
|
||||
short8 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_xor_C3_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
|
||||
|
||||
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
|
||||
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
|
||||
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
|
||||
|
||||
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
|
||||
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
|
||||
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
|
||||
|
||||
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
|
||||
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
|
||||
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
|
||||
|
||||
uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0;
|
||||
uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1;
|
||||
uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2;
|
||||
|
||||
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
|
||||
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_0.w : data_0.w;
|
||||
|
||||
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_1.xy : data_1.xy;
|
||||
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.zw : data_1.zw;
|
||||
|
||||
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.x : data_2.x;
|
||||
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
||||
? tmp_data_2.yzw : data_2.yzw;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
|
||||
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
|
||||
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C3_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
|
||||
|
||||
char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
|
||||
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
|
||||
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
|
||||
|
||||
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
|
||||
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
|
||||
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
|
||||
|
||||
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
|
||||
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
|
||||
char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
|
||||
|
||||
char4 tmp_data_0 = src1_data_0 ^ src2_data_0;
|
||||
char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
|
||||
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
|
||||
|
||||
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
|
||||
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_0.w : data_0.w;
|
||||
|
||||
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_1.xy : data_1.xy;
|
||||
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.zw : data_1.zw;
|
||||
|
||||
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.x : data_2.x;
|
||||
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
||||
? tmp_data_2.yzw : data_2.yzw;
|
||||
|
||||
*((__global char4 *)(dst + dst_index + 0)) = data_0;
|
||||
*((__global char4 *)(dst + dst_index + 4)) = data_1;
|
||||
*((__global char4 *)(dst + dst_index + 8)) = data_2;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C3_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
||||
|
||||
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
|
||||
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
|
||||
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
ushort2 src2_data_0 = (ushort2)(src2.x, src2.y);
|
||||
ushort2 src2_data_1 = (ushort2)(src2.z, src2.x);
|
||||
ushort2 src2_data_2 = (ushort2)(src2.y, src2.z);
|
||||
|
||||
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
|
||||
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
|
||||
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0;
|
||||
ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1;
|
||||
ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2;
|
||||
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C3_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
||||
|
||||
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
|
||||
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
|
||||
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
short2 src2_data_0 = (short2)(src2.x, src2.y);
|
||||
short2 src2_data_1 = (short2)(src2.z, src2.x);
|
||||
short2 src2_data_2 = (short2)(src2.y, src2.z);
|
||||
|
||||
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
|
||||
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
|
||||
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
short2 tmp_data_0 = src1_data_0 ^ src2_data_0;
|
||||
short2 tmp_data_1 = src1_data_1 ^ src2_data_1;
|
||||
short2 tmp_data_2 = src1_data_2 ^ src2_data_2;
|
||||
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C3_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
||||
|
||||
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
|
||||
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
|
||||
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
int src2_data_0 = src2.x;
|
||||
int src2_data_1 = src2.y;
|
||||
int src2_data_2 = src2.z;
|
||||
|
||||
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
|
||||
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
|
||||
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
int tmp_data_0 = src1_data_0 ^ src2_data_0;
|
||||
int tmp_data_1 = src1_data_1 ^ src2_data_1;
|
||||
int tmp_data_2 = src1_data_2 ^ src2_data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C3_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
||||
|
||||
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
|
||||
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
|
||||
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
|
||||
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
|
||||
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
|
||||
|
||||
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
|
||||
char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
|
||||
char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
char4 tmp_data_0 = src1_data_0 ^ src2_data_0;
|
||||
char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
|
||||
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_xor_C3_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
|
||||
|
||||
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
|
||||
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
|
||||
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
|
||||
|
||||
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
|
||||
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
|
||||
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
|
||||
|
||||
short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
|
||||
short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 ));
|
||||
short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
|
||||
|
||||
short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
|
||||
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
|
||||
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_xor_C4_D0 (
|
||||
__kernel void arithm_s_bitwise_binary_C4_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
@ -811,14 +483,14 @@ __kernel void arithm_s_bitwise_xor_C4_D0 (
|
||||
|
||||
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
|
||||
|
||||
uchar4 data = src_data1 ^ src2;
|
||||
uchar4 data = src_data1 OP_BINARY src2;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C4_D1 (
|
||||
__kernel void arithm_s_bitwise_binary_C4_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
@ -834,13 +506,13 @@ __kernel void arithm_s_bitwise_xor_C4_D1 (
|
||||
|
||||
char4 src_data1 = *((__global char4 *)(src1 + src1_index));
|
||||
|
||||
char4 data = src_data1 ^ src2;
|
||||
char4 data = src_data1 OP_BINARY src2;
|
||||
|
||||
*((__global char4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_xor_C4_D2 (
|
||||
__kernel void arithm_s_bitwise_binary_C4_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
@ -856,12 +528,12 @@ __kernel void arithm_s_bitwise_xor_C4_D2 (
|
||||
|
||||
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
|
||||
|
||||
ushort4 data = src_data1 ^ src2;
|
||||
ushort4 data = src_data1 OP_BINARY src2;
|
||||
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C4_D3 (
|
||||
__kernel void arithm_s_bitwise_binary_C4_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
@ -877,12 +549,12 @@ __kernel void arithm_s_bitwise_xor_C4_D3 (
|
||||
|
||||
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
|
||||
|
||||
short4 data = src_data1 ^ src2;
|
||||
short4 data = src_data1 OP_BINARY src2;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C4_D4 (
|
||||
__kernel void arithm_s_bitwise_binary_C4_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
@ -898,12 +570,12 @@ __kernel void arithm_s_bitwise_xor_C4_D4 (
|
||||
|
||||
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
|
||||
|
||||
int4 data = src_data1 ^ src2;
|
||||
int4 data = src_data1 OP_BINARY src2;
|
||||
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_xor_C4_D5 (
|
||||
__kernel void arithm_s_bitwise_binary_C4_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
@ -921,13 +593,13 @@ __kernel void arithm_s_bitwise_xor_C4_D5 (
|
||||
char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7,
|
||||
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
|
||||
|
||||
char16 tmp_data = src1_data ^ src2_data;
|
||||
char16 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
*((__global char16 *)((__global char *)dst + dst_index)) = tmp_data;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_xor_C4_D6 (
|
||||
__kernel void arithm_s_bitwise_binary_C4_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
@ -951,10 +623,10 @@ __kernel void arithm_s_bitwise_xor_C4_D6 (
|
||||
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
|
||||
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
|
||||
|
||||
short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
|
||||
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
|
||||
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
|
||||
short4 tmp_data_3 = src1_data_3 ^ src2_data_3;
|
||||
short4 tmp_data_0 = src1_data_0 OP_BINARY src2_data_0;
|
||||
short4 tmp_data_1 = src1_data_1 OP_BINARY src2_data_1;
|
||||
short4 tmp_data_2 = src1_data_2 OP_BINARY src2_data_2;
|
||||
short4 tmp_data_3 = src1_data_3 OP_BINARY src2_data_3;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
|
||||
@ -963,4 +635,4 @@ __kernel void arithm_s_bitwise_xor_C4_D6 (
|
||||
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
@ -49,11 +49,16 @@
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef OP_BINARY
|
||||
#define OP_BINARY &
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_and with scalar with mask**************************************/
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D0 (
|
||||
////////////////////////////////////////////bitwise_binary////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_binary with scalar with mask**************************************/
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C1_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -83,7 +88,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (
|
||||
uchar4 mask_data = vload4(0, mask + mask_index);
|
||||
|
||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data & src2_data;
|
||||
uchar4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
|
||||
@ -95,7 +100,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D1 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C1_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -125,7 +130,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (
|
||||
uchar4 mask_data = vload4(0, mask + mask_index);
|
||||
|
||||
char4 data = *((__global char4 *)(dst + dst_index));
|
||||
char4 tmp_data = src1_data & src2_data;
|
||||
char4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
|
||||
@ -136,7 +141,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D2 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C1_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -153,7 +158,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#define dst_align ((dst_offset / 2) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@ -166,7 +171,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
|
||||
ushort2 tmp_data = src1_data & src2_data;
|
||||
ushort2 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
|
||||
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
|
||||
@ -174,7 +179,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (
|
||||
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D3 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C1_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -191,7 +196,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#define dst_align ((dst_offset / 2) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@ -204,7 +209,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
||||
short2 tmp_data = src1_data & src2_data;
|
||||
short2 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
|
||||
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
|
||||
@ -212,7 +217,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (
|
||||
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D4 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C1_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -234,14 +239,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D4 (
|
||||
int src_data2 = src2.x;
|
||||
int dst_data = *((__global int *)((__global char *)dst + dst_index));
|
||||
|
||||
int data = src_data1 & src_data2;
|
||||
int data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D5 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C1_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -263,7 +268,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D5 (
|
||||
char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
|
||||
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
|
||||
|
||||
char4 data = src1_data & src2_data;
|
||||
char4 data = src1_data OP_BINARY src2_data;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index)) = data;
|
||||
@ -271,7 +276,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D5 (
|
||||
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C1_D6 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C1_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -293,14 +298,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D6 (
|
||||
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
|
||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||
|
||||
short4 data = src1_data & src2_data;
|
||||
short4 data = src1_data OP_BINARY src2_data;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D0 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C2_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -317,7 +322,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#define dst_align ((dst_offset / 2) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@ -330,7 +335,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data & src2_data;
|
||||
uchar4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
|
||||
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
|
||||
@ -340,7 +345,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D1 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C2_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -357,7 +362,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
#define dst_align ((dst_offset / 2) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
@ -370,7 +375,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
char4 data = *((__global char4 *)(dst + dst_index));
|
||||
char4 tmp_data = src1_data & src2_data;
|
||||
char4 tmp_data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
|
||||
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
|
||||
@ -379,7 +384,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D2 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C2_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -401,13 +406,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D2 (
|
||||
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
|
||||
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
|
||||
|
||||
ushort2 data = src_data1 & src_data2;
|
||||
ushort2 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D3 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C2_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -429,13 +434,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D3 (
|
||||
short2 src_data2 = (short2)(src2.x, src2.y);
|
||||
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
|
||||
|
||||
short2 data = src_data1 & src_data2;
|
||||
short2 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D4 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C2_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -457,13 +462,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D4 (
|
||||
int2 src_data2 = (int2)(src2.x, src2.y);
|
||||
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
|
||||
|
||||
int2 data = src_data1 & src_data2;
|
||||
int2 data = src_data1 OP_BINARY src_data2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D5 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C2_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -485,7 +490,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D5 (
|
||||
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
|
||||
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
|
||||
|
||||
char8 data = src1_data & src2_data;
|
||||
char8 data = src1_data OP_BINARY src2_data;
|
||||
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
@ -493,7 +498,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D5 (
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C2_D6 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C2_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -515,388 +520,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D6 (
|
||||
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
|
||||
short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index));
|
||||
|
||||
short8 data = src1_data & src2_data;
|
||||
short8 data = src1_data OP_BINARY src2_data;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global short8 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
|
||||
|
||||
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
|
||||
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
|
||||
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
|
||||
|
||||
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
|
||||
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
|
||||
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
|
||||
|
||||
uchar4 mask_data = vload4(0, mask + mask_index);
|
||||
|
||||
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
|
||||
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
|
||||
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
|
||||
|
||||
uchar4 tmp_data_0 = src1_data_0 & src2_data_0;
|
||||
uchar4 tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
uchar4 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
|
||||
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_0.w : data_0.w;
|
||||
|
||||
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_1.xy : data_1.xy;
|
||||
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.zw : data_1.zw;
|
||||
|
||||
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.x : data_2.x;
|
||||
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
||||
? tmp_data_2.yzw : data_2.yzw;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
|
||||
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
|
||||
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
|
||||
|
||||
char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
|
||||
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
|
||||
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
|
||||
|
||||
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
|
||||
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
|
||||
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
|
||||
|
||||
uchar4 mask_data = vload4(0, mask + mask_index);
|
||||
|
||||
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
|
||||
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
|
||||
char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
|
||||
|
||||
char4 tmp_data_0 = src1_data_0 & src2_data_0;
|
||||
char4 tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
char4 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
|
||||
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_0.w : data_0.w;
|
||||
|
||||
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_1.xy : data_1.xy;
|
||||
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.zw : data_1.zw;
|
||||
|
||||
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.x : data_2.x;
|
||||
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
||||
? tmp_data_2.yzw : data_2.yzw;
|
||||
|
||||
*((__global char4 *)(dst + dst_index + 0)) = data_0;
|
||||
*((__global char4 *)(dst + dst_index + 4)) = data_1;
|
||||
*((__global char4 *)(dst + dst_index + 8)) = data_2;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
||||
|
||||
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
|
||||
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
|
||||
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
ushort2 src2_data_0 = (ushort2)(src2.x, src2.y);
|
||||
ushort2 src2_data_1 = (ushort2)(src2.z, src2.x);
|
||||
ushort2 src2_data_2 = (ushort2)(src2.y, src2.z);
|
||||
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
|
||||
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
|
||||
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
ushort2 tmp_data_0 = src1_data_0 & src2_data_0;
|
||||
ushort2 tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
ushort2 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
||||
|
||||
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
|
||||
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
|
||||
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
short2 src2_data_0 = (short2)(src2.x, src2.y);
|
||||
short2 src2_data_1 = (short2)(src2.z, src2.x);
|
||||
short2 src2_data_2 = (short2)(src2.y, src2.z);
|
||||
|
||||
uchar2 mask_data = vload2(0, mask + mask_index);
|
||||
|
||||
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
|
||||
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
|
||||
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
short2 tmp_data_0 = src1_data_0 & src2_data_0;
|
||||
short2 tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
short2 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
||||
|
||||
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
|
||||
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
|
||||
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
int src2_data_0 = src2.x;
|
||||
int src2_data_1 = src2.y;
|
||||
int src2_data_2 = src2.z;
|
||||
|
||||
uchar mask_data = * (mask + mask_index);
|
||||
|
||||
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
|
||||
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
|
||||
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
int tmp_data_0 = src1_data_0 & src2_data_0;
|
||||
int tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
int tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
data_0 = mask_data ? tmp_data_0 : data_0;
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
||||
|
||||
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
|
||||
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
|
||||
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
|
||||
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
|
||||
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
|
||||
|
||||
uchar mask_data = * (mask + mask_index);
|
||||
|
||||
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
|
||||
char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
|
||||
char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
char4 tmp_data_0 = src1_data_0 & src2_data_0;
|
||||
char4 tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
char4 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
data_0 = mask_data ? tmp_data_0 : data_0;
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C3_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
|
||||
int mask_index = mad24(y, mask_step, x + mask_offset);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
|
||||
|
||||
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
|
||||
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
|
||||
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
|
||||
|
||||
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
|
||||
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
|
||||
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
|
||||
|
||||
uchar mask_data = * (mask + mask_index);
|
||||
|
||||
short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
|
||||
short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 ));
|
||||
short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
|
||||
|
||||
short4 tmp_data_0 = src1_data_0 & src2_data_0;
|
||||
short4 tmp_data_1 = src1_data_1 & src2_data_1;
|
||||
short4 tmp_data_2 = src1_data_2 & src2_data_2;
|
||||
|
||||
data_0 = mask_data ? tmp_data_0 : data_0;
|
||||
data_1 = mask_data ? tmp_data_1 : data_1;
|
||||
data_2 = mask_data ? tmp_data_2 : data_2;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D0 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C4_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -917,7 +548,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D0 (
|
||||
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
|
||||
uchar4 data = src_data1 & src2;
|
||||
uchar4 data = src_data1 OP_BINARY src2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
||||
@ -925,7 +556,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D0 (
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D1 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C4_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -946,14 +577,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D1 (
|
||||
char4 src_data1 = *((__global char4 *)(src1 + src1_index));
|
||||
char4 dst_data = *((__global char4 *)(dst + dst_index));
|
||||
|
||||
char4 data = src_data1 & src2;
|
||||
char4 data = src_data1 OP_BINARY src2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global char4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D2 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C4_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -974,13 +605,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D2 (
|
||||
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
|
||||
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
||||
|
||||
ushort4 data = src_data1 & src2;
|
||||
ushort4 data = src_data1 OP_BINARY src2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D3 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C4_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -1001,13 +632,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D3 (
|
||||
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
|
||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||
|
||||
short4 data = src_data1 & src2;
|
||||
short4 data = src_data1 OP_BINARY src2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D4 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C4_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -1028,13 +659,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D4 (
|
||||
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
|
||||
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
|
||||
|
||||
int4 data = src_data1 & src2;
|
||||
int4 data = src_data1 OP_BINARY src2;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D5 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C4_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -1057,14 +688,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D5 (
|
||||
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
|
||||
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
|
||||
|
||||
char16 data = src1_data & src2_data;
|
||||
char16 data = src1_data OP_BINARY src2_data;
|
||||
data = mask_data ? data : dst_data;
|
||||
|
||||
*((__global char16 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_and_with_mask_C4_D6 (
|
||||
__kernel void arithm_s_bitwise_binary_with_mask_C4_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
__global uchar *mask, int mask_step, int mask_offset,
|
||||
@ -1097,10 +728,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (
|
||||
short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
|
||||
short4 dst_data_3 = *((__global short4 *)((__global char *)dst + dst_index + 24));
|
||||
|
||||
short4 data_0 = src1_data_0 & src2_data_0;
|
||||
short4 data_1 = src1_data_1 & src2_data_1;
|
||||
short4 data_2 = src1_data_2 & src2_data_2;
|
||||
short4 data_3 = src1_data_3 & src2_data_3;
|
||||
short4 data_0 = src1_data_0 OP_BINARY src2_data_0;
|
||||
short4 data_1 = src1_data_1 OP_BINARY src2_data_1;
|
||||
short4 data_2 = src1_data_2 OP_BINARY src2_data_2;
|
||||
short4 data_3 = src1_data_3 OP_BINARY src2_data_3;
|
||||
|
||||
data_0 = mask_data ? data_0 : dst_data_0;
|
||||
data_1 = mask_data ? data_1 : dst_data_1;
|
@ -1,294 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Jiang Liyuan, jlyuan001.good@163.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_or without mask**************************************/
|
||||
__kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data | src2_data;
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
|
||||
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = dst_data;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
char4 src1_data = vload4(0, src1 + src1_index);
|
||||
char4 src2_data = vload4(0, src2 + src2_index);
|
||||
|
||||
char4 dst_data = *((__global char4 *)(dst + dst_index));
|
||||
char4 tmp_data = src1_data | src2_data;
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
|
||||
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
|
||||
|
||||
*((__global char4 *)(dst + dst_index)) = dst_data;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *src2, int src2_step, int src2_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
|
||||
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
|
||||
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
|
||||
ushort4 tmp_data = src1_data | src2_data;
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
|
||||
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
|
||||
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
|
||||
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
|
||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||
short4 tmp_data = src1_data | src2_data;
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
|
||||
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
|
||||
int data1 = *((__global int *)((__global char *)src1 + src1_index));
|
||||
int data2 = *((__global int *)((__global char *)src2 + src2_index));
|
||||
int tmp = data1 | data2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index)) = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
|
||||
char4 data1 = *((__global char4 *)((__global char *)src1 + src1_index));
|
||||
char4 data2 = *((__global char4 *)((__global char *)src2 + src2_index));
|
||||
char4 tmp = data1 | data2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index)) = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
|
||||
char8 data1 = *((__global char8 *)((__global char *)src1 + src1_index));
|
||||
char8 data2 = *((__global char8 *)((__global char *)src2 + src2_index));
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = data1 | data2;
|
||||
}
|
||||
}
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
@ -1,973 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Jiang Liyuan, jlyuan001.good@163.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************and with scalar without mask**************************************/
|
||||
__kernel void arithm_s_bitwise_or_C1_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index);
|
||||
uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);
|
||||
|
||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data | src2_data;
|
||||
|
||||
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
|
||||
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
|
||||
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C1_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
char4 src1_data = vload4(0, src1 + src1_index);
|
||||
char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);
|
||||
|
||||
char4 data = *((__global char4 *)(dst + dst_index));
|
||||
char4 tmp_data = src1_data | src2_data;
|
||||
|
||||
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
|
||||
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
|
||||
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
|
||||
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
|
||||
|
||||
*((__global char4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C1_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort2 src2_data = (ushort2)(src2.x, src2.x);
|
||||
|
||||
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
|
||||
ushort2 tmp_data = src1_data | src2_data;
|
||||
|
||||
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
|
||||
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
|
||||
|
||||
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C1_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short2 src2_data = (short2)(src2.x, src2.x);
|
||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
|
||||
|
||||
short2 tmp_data = src1_data | src2_data;
|
||||
|
||||
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
|
||||
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
|
||||
|
||||
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C1_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
|
||||
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
|
||||
int src_data2 = src2.x;
|
||||
|
||||
int data = src_data1 | src_data2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C1_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
|
||||
char4 src_data1 = *((__global char4 *)((__global char *)src1 + src1_index));
|
||||
char4 src_data2 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
|
||||
|
||||
char4 data = src_data1 | src_data2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_s_bitwise_or_C1_D6 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
|
||||
short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
|
||||
|
||||
short4 tmp_data = src1_data | src2_data;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_s_bitwise_or_C2_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index);
|
||||
uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y);
|
||||
|
||||
uchar4 data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = src1_data | src2_data;
|
||||
|
||||
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
|
||||
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C2_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
|
||||
|
||||
char4 src1_data = vload4(0, src1 + src1_index);
|
||||
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
|
||||
|
||||
char4 data = *((__global char4 *)(dst + dst_index));
|
||||
char4 tmp_data = src1_data | src2_data;
|
||||
|
||||
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
|
||||
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
|
||||
|
||||
*((__global char4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C2_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
|
||||
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
|
||||
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
|
||||
|
||||
ushort2 data = src_data1 | src_data2;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C2_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
|
||||
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
|
||||
short2 src_data2 = (short2)(src2.x, src2.y);
|
||||
|
||||
short2 data = src_data1 | src_data2;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
|
||||
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
|
||||
int2 src_data2 = (int2)(src2.x, src2.y);
|
||||
|
||||
int2 data = src_data1 | src_data2;
|
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C2_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
|
||||
char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
|
||||
char8 src_data2 = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
|
||||
|
||||
char8 data = src_data1 | src_data2;
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
/* Bitwise OR of a 2-channel double image with a scalar (C2, depth D6).
   Doubles are OR-ed as raw 16-bit lanes: one pixel = short8 (16 bytes) at
   byte offset x<<4.
   BUG FIX: this OR kernel previously combined the operands with '&'
   (bitwise AND), producing AND results for CV_64FC2 scalar OR. */
__kernel void arithm_s_bitwise_or_C2_D6 (
        __global short *src1, int src1_step, int src1_offset,
        __global short *dst, int dst_step, int dst_offset,
        short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int dst_index  = mad24(y, dst_step, (x << 4) + dst_offset);

        short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
        /* only the first 8 scalar lanes cover the 2-channel double pixel */
        short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);

        short8 tmp_data = src1_data | src2_data;   /* was '&' — wrong op for an OR kernel */

        *((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
    }
}
#endif
|
||||
__kernel void arithm_s_bitwise_or_C3_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
|
||||
|
||||
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
|
||||
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
|
||||
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
|
||||
|
||||
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
|
||||
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
|
||||
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
|
||||
|
||||
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
|
||||
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
|
||||
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
|
||||
|
||||
uchar4 tmp_data_0 = src1_data_0 | src2_data_0 ;
|
||||
uchar4 tmp_data_1 = src1_data_1 | src2_data_1 ;
|
||||
uchar4 tmp_data_2 = src1_data_2 | src2_data_2 ;
|
||||
|
||||
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
|
||||
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_0.w : data_0.w;
|
||||
|
||||
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_1.xy : data_1.xy;
|
||||
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.zw : data_1.zw;
|
||||
|
||||
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.x : data_2.x;
|
||||
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
||||
? tmp_data_2.yzw : data_2.yzw;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
|
||||
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
|
||||
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C3_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
|
||||
|
||||
char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
|
||||
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
|
||||
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
|
||||
|
||||
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
|
||||
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
|
||||
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
|
||||
|
||||
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
|
||||
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
|
||||
char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
|
||||
|
||||
char4 tmp_data_0 = src1_data_0 | src2_data_0;
|
||||
char4 tmp_data_1 = src1_data_1 | src2_data_1;
|
||||
char4 tmp_data_2 = src1_data_2 | src2_data_2;
|
||||
|
||||
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
|
||||
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_0.w : data_0.w;
|
||||
|
||||
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
|
||||
? tmp_data_1.xy : data_1.xy;
|
||||
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.zw : data_1.zw;
|
||||
|
||||
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.x : data_2.x;
|
||||
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
|
||||
? tmp_data_2.yzw : data_2.yzw;
|
||||
|
||||
*((__global char4 *)(dst + dst_index + 0)) = data_0;
|
||||
*((__global char4 *)(dst + dst_index + 4)) = data_1;
|
||||
*((__global char4 *)(dst + dst_index + 8)) = data_2;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C3_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
||||
|
||||
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
|
||||
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
|
||||
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
ushort2 src2_data_0 = (ushort2)(src2.x, src2.y);
|
||||
ushort2 src2_data_1 = (ushort2)(src2.z, src2.x);
|
||||
ushort2 src2_data_2 = (ushort2)(src2.y, src2.z);
|
||||
|
||||
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
|
||||
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
|
||||
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
ushort2 tmp_data_0 = src1_data_0 | src2_data_0 ;
|
||||
ushort2 tmp_data_1 = src1_data_1 | src2_data_1 ;
|
||||
ushort2 tmp_data_2 = src1_data_2 | src2_data_2 ;
|
||||
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C3_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 1;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
|
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
|
||||
|
||||
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
|
||||
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
|
||||
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
short2 src2_data_0 = (short2)(src2.x, src2.y);
|
||||
short2 src2_data_1 = (short2)(src2.z, src2.x);
|
||||
short2 src2_data_2 = (short2)(src2.y, src2.z);
|
||||
|
||||
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
|
||||
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
|
||||
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
short2 tmp_data_0 = src1_data_0 | src2_data_0 ;
|
||||
short2 tmp_data_1 = src1_data_1 | src2_data_1 ;
|
||||
short2 tmp_data_2 = src1_data_2 | src2_data_2 ;
|
||||
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
|
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
|
||||
? tmp_data_1.x : data_1.x;
|
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_1.y : data_1.y;
|
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
|
||||
? tmp_data_2.xy : data_2.xy;
|
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C3_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
||||
|
||||
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
|
||||
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
|
||||
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
int src2_data_0 = src2.x;
|
||||
int src2_data_1 = src2.y;
|
||||
int src2_data_2 = src2.z;
|
||||
|
||||
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
|
||||
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
|
||||
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
|
||||
|
||||
int tmp_data_0 = src1_data_0 | src2_data_0;
|
||||
int tmp_data_1 = src1_data_1 | src2_data_1;
|
||||
int tmp_data_2 = src1_data_2 | src2_data_2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C3_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
|
||||
|
||||
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
|
||||
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
|
||||
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
|
||||
|
||||
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
|
||||
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
|
||||
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
|
||||
|
||||
char4 tmp_data_0 = src1_data_0 | src2_data_0;
|
||||
char4 tmp_data_1 = src1_data_1 | src2_data_1;
|
||||
char4 tmp_data_2 = src1_data_2 | src2_data_2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
|
||||
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
/* Bitwise OR of a 3-channel double image with a scalar (C3, depth D6).
   Doubles are OR-ed as raw 16-bit lanes: one pixel = 24 bytes, processed
   as three short4 vectors.
   CLEANUP: removed three dead loads from dst (data_0..data_2) — their
   values were never used; the stores below overwrite those locations
   unconditionally. */
__kernel void arithm_s_bitwise_or_C3_D6 (
        __global short *src1, int src1_step, int src1_offset,
        __global short *dst, int dst_step, int dst_offset,
        short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 24));

        short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
        short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
        short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));

        /* first 12 scalar lanes cover the three double channels */
        short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);

        short4 tmp_data_0 = src1_data_0 | src2_data_0;
        short4 tmp_data_1 = src1_data_1 | src2_data_1;
        short4 tmp_data_2 = src1_data_2 | src2_data_2;

        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
    }
}
#endif
|
||||
__kernel void arithm_s_bitwise_or_C4_D0 (
|
||||
__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
uchar4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
|
||||
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
|
||||
|
||||
uchar4 data = src_data1 | src2;
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C4_D1 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
|
||||
char4 src_data1 = *((__global char4 *)(src1 + src1_index));
|
||||
|
||||
char4 data = src_data1 | src2;
|
||||
|
||||
*((__global char4 *)(dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_s_bitwise_or_C4_D2 (
|
||||
__global ushort *src1, int src1_step, int src1_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
ushort4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
|
||||
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
|
||||
|
||||
ushort4 data = src_data1 | src2;
|
||||
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C4_D3 (
|
||||
__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
short4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
|
||||
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
|
||||
|
||||
short4 data = src_data1 | src2;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C4_D4 (
|
||||
__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int4 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
|
||||
|
||||
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
|
||||
|
||||
int4 data = src_data1 | src2;
|
||||
|
||||
*((__global int4 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_s_bitwise_or_C4_D5 (
|
||||
__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
char16 src2, int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
|
||||
|
||||
char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index));
|
||||
char16 src_data2 = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7,
|
||||
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
|
||||
|
||||
char16 data = src_data1 | src_data2;
|
||||
|
||||
*((__global char16 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
/* Bitwise OR of a 4-channel double image with a scalar (C4, depth D6).
   Doubles are OR-ed as raw 16-bit lanes: one pixel = 32 bytes, processed
   as four short4 vectors at byte offset x<<5. */
__kernel void arithm_s_bitwise_or_C4_D6 (
        __global short *src1, int src1_step, int src1_offset,
        __global short *dst, int dst_step, int dst_offset,
        short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    int src_off = mad24(y, src1_step, (x << 5) + src1_offset);
    int dst_off = mad24(y, dst_step, (x << 5) + dst_offset);

    short4 p0 = *((__global short4 *)((__global char *)src1 + src_off + 0));
    short4 p1 = *((__global short4 *)((__global char *)src1 + src_off + 8));
    short4 p2 = *((__global short4 *)((__global char *)src1 + src_off + 16));
    short4 p3 = *((__global short4 *)((__global char *)src1 + src_off + 24));

    /* the 16 scalar lanes cover all four double channels */
    short4 q0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
    short4 q1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
    short4 q2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
    short4 q3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);

    *((__global short4 *)((__global char *)dst + dst_off + 0 ))= p0 | q0;
    *((__global short4 *)((__global char *)dst + dst_off + 8 ))= p1 | q1;
    *((__global short4 *)((__global char *)dst + dst_off + 16))= p2 | q2;
    *((__global short4 *)((__global char *)dst + dst_off + 24))= p3 | q3;
}
#endif
|
File diff suppressed because it is too large
Load Diff
@ -1,340 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Jiang Liyuan, jlyuan001.good@163.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/**************************************bitwise_xor without mask**************************************/
|
||||
/* Elementwise XOR of two uchar images (depth D0, no mask).
   Each work-item processes 4 bytes. Source reads are aligned to the
   destination's 4-byte boundary; when the back-shifted source index goes
   negative, the loaded vector is rotated to recover the intended bytes.
   Edge bytes outside [dst_start, dst_end) keep their destination values. */
__kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                     __global uchar *src2, int src2_step, int src2_offset,
                                     __global uchar *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
    int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
    int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

    int dst_start = mad24(y, dst_step, dst_offset);
    int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
    int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

    /* clamp negative indices for the load, then rotate to compensate */
    int fix1 = src1_index < 0 ? 0 : src1_index;
    int fix2 = src2_index < 0 ? 0 : src2_index;
    uchar4 a = vload4(0, src1 + fix1);
    uchar4 b = vload4(0, src2 + fix2);

    if (src1_index < 0)
    {
        uchar4 rot;
        rot.xyzw = (src1_index == -2) ? a.zwxy : a.yzwx;
        a.xyzw   = (src1_index == -1) ? a.wxyz : rot.xyzw;
    }
    if (src2_index < 0)
    {
        uchar4 rot;
        rot.xyzw = (src2_index == -2) ? b.zwxy : b.yzwx;
        b.xyzw   = (src2_index == -1) ? b.wxyz : rot.xyzw;
    }

    uchar4 out = *((__global uchar4 *)(dst + dst_index));
    uchar4 t   = a ^ b;

    out.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? t.x : out.x;
    out.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? t.y : out.y;
    out.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? t.z : out.z;
    out.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? t.w : out.w;

    *((__global uchar4 *)(dst + dst_index)) = out;
}
|
||||
|
||||
|
||||
/* Elementwise XOR of two char images (depth D1, no mask).
   Identical scheme to the uchar variant: 4 bytes per work-item, negative
   source indices handled by rotating the loaded vector, edge bytes masked
   against [dst_start, dst_end). */
__kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src1_offset,
                                     __global char *src2, int src2_step, int src2_offset,
                                     __global char *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
    int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
    int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

    int dst_start = mad24(y, dst_step, dst_offset);
    int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
    int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

    /* clamp negative indices for the load, then rotate to compensate */
    int fix1 = src1_index < 0 ? 0 : src1_index;
    int fix2 = src2_index < 0 ? 0 : src2_index;
    char4 a = vload4(0, src1 + fix1);
    char4 b = vload4(0, src2 + fix2);

    if (src1_index < 0)
    {
        char4 rot;
        rot.xyzw = (src1_index == -2) ? a.zwxy : a.yzwx;
        a.xyzw   = (src1_index == -1) ? a.wxyz : rot.xyzw;
    }
    if (src2_index < 0)
    {
        char4 rot;
        rot.xyzw = (src2_index == -2) ? b.zwxy : b.yzwx;
        b.xyzw   = (src2_index == -1) ? b.wxyz : rot.xyzw;
    }

    char4 out = *((__global char4 *)(dst + dst_index));
    char4 t   = a ^ b;

    out.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? t.x : out.x;
    out.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? t.y : out.y;
    out.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? t.z : out.z;
    out.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? t.w : out.w;

    *((__global char4 *)(dst + dst_index)) = out;
}
|
||||
|
||||
|
||||
/* Elementwise XOR of two ushort images (depth D2, no mask).
   Each work-item processes 4 ushorts (8 bytes), aligned to the
   destination's 8-byte boundary; negative source indices are handled by
   rotating the loaded vector, edge elements masked against
   [dst_start, dst_end). */
__kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                     __global ushort *src2, int src2_step, int src2_offset,
                                     __global ushort *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
    int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
    int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

    int dst_start = mad24(y, dst_step, dst_offset);
    int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
    int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

    /* clamp negative indices for the load, then rotate to compensate */
    int fix1 = src1_index < 0 ? 0 : src1_index;
    int fix2 = src2_index < 0 ? 0 : src2_index;
    ushort4 a = vload4(0, (__global ushort *)((__global char *)src1 + fix1));
    ushort4 b = vload4(0, (__global ushort *)((__global char *)src2 + fix2));

    if (src1_index < 0)
    {
        ushort4 rot;
        rot.xyzw = (src1_index == -2) ? a.zwxy : a.yzwx;
        a.xyzw   = (src1_index == -1) ? a.wxyz : rot.xyzw;
    }
    if (src2_index < 0)
    {
        ushort4 rot;
        rot.xyzw = (src2_index == -2) ? b.zwxy : b.yzwx;
        b.xyzw   = (src2_index == -1) ? b.wxyz : rot.xyzw;
    }

    ushort4 out = *((__global ushort4 *)((__global char *)dst + dst_index));
    ushort4 t   = a ^ b;

    out.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? t.x : out.x;
    out.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? t.y : out.y;
    out.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? t.z : out.z;
    out.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? t.w : out.w;

    *((__global ushort4 *)((__global char *)dst + dst_index)) = out;
}
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int src1_offset,
|
||||
__global short *src2, int src2_step, int src2_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
x = x << 2;
|
||||
|
||||
#ifdef dst_align
|
||||
#undef dst_align
|
||||
#endif
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
short4 tmp_data = src1_data ^ src2_data;
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
|
||||
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
|
||||
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
__kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1_offset,
|
||||
__global int *src2, int src2_step, int src2_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
|
||||
int data1 = *((__global int *)((__global char *)src1 + src1_index));
|
||||
int data2 = *((__global int *)((__global char *)src2 + src2_index));
|
||||
int tmp = data1 ^ data2;
|
||||
|
||||
*((__global int *)((__global char *)dst + dst_index)) = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
|
||||
char4 data1 = *((__global char4 *)((__global char *)src1 + src1_index));
|
||||
char4 data2 = *((__global char4 *)((__global char *)src2 + src2_index));
|
||||
char4 tmp = data1 ^ data2;
|
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index)) = tmp;
|
||||
}
|
||||
}
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src1_offset,
|
||||
__global char *src2, int src2_step, int src2_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
|
||||
char8 data1 = *((__global char8 *)((__global char *)src1 + src1_index));
|
||||
char8 data2 = *((__global char8 *)((__global char *)src2 + src2_index));
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = data1 ^ data2;
|
||||
}
|
||||
}
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -79,15 +79,73 @@
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
|
||||
#endif
|
||||
|
||||
#define THREADS 256
|
||||
#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)
|
||||
|
||||
inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp,
|
||||
int dst_rows, int dst_cols,
|
||||
int dst_startX, int dst_x_off,
|
||||
float alpha)
|
||||
{
|
||||
if(get_local_id(0) < anX || get_local_id(0) >= (THREADS-ksX+anX+1))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
uint4 tmp_sum = 0;
|
||||
int posX = dst_startX - dst_x_off + (get_local_id(0)-anX)*4;
|
||||
int posY = (get_group_id(1) << 1);
|
||||
|
||||
for(int i=-anX; i<=anX; i++)
|
||||
{
|
||||
tmp_sum += vload4(get_local_id(0), temp+i);
|
||||
}
|
||||
|
||||
if(posY < dst_rows && posX < dst_cols)
|
||||
{
|
||||
tmp_sum /= (uint4) alpha;
|
||||
if(posX >= 0 && posX < dst_cols)
|
||||
*(dst) = tmp_sum.x;
|
||||
if(posX+1 >= 0 && posX+1 < dst_cols)
|
||||
*(dst + 1) = tmp_sum.y;
|
||||
if(posX+2 >= 0 && posX+2 < dst_cols)
|
||||
*(dst + 2) = tmp_sum.z;
|
||||
if(posX+3 >= 0 && posX+3 < dst_cols)
|
||||
*(dst + 3) = tmp_sum.w;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp,
|
||||
int dst_rows, int dst_cols,
|
||||
int dst_startX, int dst_x_off,
|
||||
float alpha)
|
||||
{
|
||||
if(get_local_id(0) >= (THREADS-ksX+1))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
int posX = dst_startX - dst_x_off + get_local_id(0);
|
||||
int posY = (get_group_id(1) << 1);
|
||||
|
||||
uint4 temp_sum = 0;
|
||||
for(int i=-anX; i<=anX; i++)
|
||||
{
|
||||
temp_sum += temp[get_local_id(0) + anX + i];
|
||||
}
|
||||
|
||||
if(posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows)
|
||||
*dst = convert_uchar4(convert_float4(temp_sum)/alpha);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#define THREADS 256
|
||||
#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)
|
||||
__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha,
|
||||
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step
|
||||
)
|
||||
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step
|
||||
)
|
||||
{
|
||||
|
||||
int col = get_local_id(0);
|
||||
@ -105,115 +163,84 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
|
||||
int dst_startY = (gY << 1) + dst_y_off;
|
||||
|
||||
uint4 data[ksY+1];
|
||||
__local uint4 temp[(THREADS<<1)];
|
||||
__local uint4 temp[2][THREADS];
|
||||
|
||||
#ifdef BORDER_CONSTANT
|
||||
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
|
||||
{
|
||||
if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
|
||||
data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
|
||||
else
|
||||
{
|
||||
data[i]=0;
|
||||
int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
|
||||
if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
|
||||
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
|
||||
if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
|
||||
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
|
||||
if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
|
||||
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
|
||||
if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
|
||||
}
|
||||
data[i].x = *(src+(startY+i)*src_step + startX + col * 4);
|
||||
data[i].y = *(src+(startY+i)*src_step + startX + col * 4 + 1);
|
||||
data[i].z = *(src+(startY+i)*src_step + startX + col * 4 + 2);
|
||||
data[i].w = *(src+(startY+i)*src_step + startX + col * 4 + 3);
|
||||
}
|
||||
else
|
||||
{
|
||||
data[i]=0;
|
||||
int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
|
||||
if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
|
||||
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
|
||||
if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
|
||||
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
|
||||
if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
|
||||
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
|
||||
if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
int not_all_in_range;
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
|
||||
| (startY+i<0) | (startY+i>src_whole_rows-1);
|
||||
if(not_all_in_range)
|
||||
{
|
||||
int selected_row;
|
||||
int4 selected_col;
|
||||
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
|
||||
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
|
||||
int not_all_in_range;
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
|
||||
| (startY+i<0) | (startY+i>src_whole_rows-1);
|
||||
if(not_all_in_range)
|
||||
{
|
||||
int selected_row;
|
||||
int4 selected_col;
|
||||
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
|
||||
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
|
||||
|
||||
selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
|
||||
selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);
|
||||
selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
|
||||
selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);
|
||||
|
||||
selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
|
||||
selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);
|
||||
selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
|
||||
selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);
|
||||
|
||||
selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
|
||||
selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);
|
||||
selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
|
||||
selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);
|
||||
|
||||
selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
|
||||
selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);
|
||||
selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
|
||||
selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);
|
||||
|
||||
data[i].x = *(src + selected_row * src_step + selected_col.x);
|
||||
data[i].y = *(src + selected_row * src_step + selected_col.y);
|
||||
data[i].z = *(src + selected_row * src_step + selected_col.z);
|
||||
data[i].w = *(src + selected_row * src_step + selected_col.w);
|
||||
}
|
||||
else
|
||||
{
|
||||
data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
|
||||
}
|
||||
}
|
||||
data[i].x = *(src + selected_row * src_step + selected_col.x);
|
||||
data[i].y = *(src + selected_row * src_step + selected_col.y);
|
||||
data[i].z = *(src + selected_row * src_step + selected_col.z);
|
||||
data[i].w = *(src + selected_row * src_step + selected_col.w);
|
||||
}
|
||||
else
|
||||
{
|
||||
data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
uint4 sum0 = 0, sum1 = 0, sum2 = 0;
|
||||
uint4 tmp_sum = 0;
|
||||
for(int i=1; i < ksY; i++)
|
||||
{
|
||||
sum0 += (data[i]);
|
||||
tmp_sum += (data[i]);
|
||||
}
|
||||
sum1 = sum0 + (data[0]);
|
||||
sum2 = sum0 + (data[ksY]);
|
||||
|
||||
int index = dst_startY * dst_step + dst_startX + (col-anX)*4;
|
||||
|
||||
temp[col] = sum1;
|
||||
temp[col+THREADS] = sum2;
|
||||
temp[0][col] = tmp_sum + (data[0]);
|
||||
temp[1][col] = tmp_sum + (data[ksY]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(col >= anX && col < (THREADS-ksX+anX+1))
|
||||
{
|
||||
int posX = dst_startX - dst_x_off + (col-anX)*4;
|
||||
int posY = (gY << 1);
|
||||
uint4 tmp_sum1=0, tmp_sum2=0;
|
||||
for(int i=-anX; i<=anX; i++)
|
||||
{
|
||||
tmp_sum1 += vload4(col, (__local uint*)temp+i);
|
||||
}
|
||||
|
||||
for(int i=-anX; i<=anX; i++)
|
||||
{
|
||||
tmp_sum2 += vload4(col, (__local uint*)(temp+THREADS)+i);
|
||||
}
|
||||
|
||||
if(posY < dst_rows && posX < dst_cols)
|
||||
{
|
||||
if(posX >= 0 && posX < dst_cols)
|
||||
*(dst+dst_startY * dst_step + dst_startX + (col-anX)*4) = tmp_sum1.x/alpha;
|
||||
if(posX+1 >= 0 && posX+1 < dst_cols)
|
||||
*(dst+dst_startY * dst_step + dst_startX+1 + (col-anX)*4) = tmp_sum1.y/alpha;
|
||||
if(posX+2 >= 0 && posX+2 < dst_cols)
|
||||
*(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum1.z/alpha;
|
||||
if(posX+3 >= 0 && posX+3 < dst_cols)
|
||||
*(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum1.w/alpha;
|
||||
}
|
||||
if(posY+1 < dst_rows && posX < dst_cols)
|
||||
{
|
||||
dst_startY+=1;
|
||||
if(posX >= 0 && posX < dst_cols)
|
||||
*(dst+dst_startY * dst_step + dst_startX + (col-anX)*4) = tmp_sum2.x/alpha;
|
||||
if(posX+1 >= 0 && posX+1 < dst_cols)
|
||||
*(dst+dst_startY * dst_step + dst_startX+1 + (col-anX)*4) = tmp_sum2.y/alpha;
|
||||
if(posX+2 >= 0 && posX+2 < dst_cols)
|
||||
*(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum2.z/alpha;
|
||||
if(posX+3 >= 0 && posX+3 < dst_cols)
|
||||
*(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum2.w/alpha;
|
||||
}
|
||||
}
|
||||
update_dst_C1_D0(dst+index, (__local uint *)(temp[0]),
|
||||
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
|
||||
update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]),
|
||||
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
|
||||
|
||||
}
|
||||
|
||||
@ -221,9 +248,9 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
|
||||
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha,
|
||||
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step
|
||||
)
|
||||
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step
|
||||
)
|
||||
{
|
||||
int col = get_local_id(0);
|
||||
const int gX = get_group_id(0);
|
||||
@ -238,81 +265,63 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
|
||||
int startY = (gY << 1) - anY + src_y_off;
|
||||
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
|
||||
int dst_startY = (gY << 1) + dst_y_off;
|
||||
//int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
|
||||
|
||||
int end_addr = src_whole_cols-4;
|
||||
uint4 data[ksY+1];
|
||||
__local uint4 temp[2][THREADS];
|
||||
|
||||
#ifdef BORDER_CONSTANT
|
||||
bool con;
|
||||
uint4 ss;
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
|
||||
|
||||
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
|
||||
//ss = convert_uint4(src[cur_addr]);
|
||||
|
||||
int cur_col = clamp(startX + col, 0, src_whole_cols);
|
||||
if(con)
|
||||
ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]);
|
||||
|
||||
data[i] = con ? ss : 0;
|
||||
data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0;
|
||||
data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0;
|
||||
data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0;
|
||||
data[i].w = con ? src[(startY+i)*(src_step>>2) + cur_col].w : 0;
|
||||
}
|
||||
#else
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
int selected_row;
|
||||
int selected_col;
|
||||
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
|
||||
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
int selected_row;
|
||||
int selected_col;
|
||||
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
|
||||
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
|
||||
|
||||
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
|
||||
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
|
||||
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
|
||||
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
|
||||
|
||||
|
||||
data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
|
||||
}
|
||||
data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
|
||||
}
|
||||
|
||||
#endif
|
||||
uint4 sum0 = 0, sum1 = 0, sum2 = 0;
|
||||
uint4 tmp_sum = 0;
|
||||
for(int i=1; i < ksY; i++)
|
||||
{
|
||||
sum0 += (data[i]);
|
||||
tmp_sum += (data[i]);
|
||||
}
|
||||
sum1 = sum0 + (data[0]);
|
||||
sum2 = sum0 + (data[ksY]);
|
||||
temp[0][col] = sum1;
|
||||
temp[1][col] = sum2;
|
||||
|
||||
int index = dst_startY * (dst_step>>2)+ dst_startX + col;
|
||||
|
||||
temp[0][col] = tmp_sum + (data[0]);
|
||||
temp[1][col] = tmp_sum + (data[ksY]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(col < (THREADS-(ksX-1)))
|
||||
{
|
||||
col += anX;
|
||||
int posX = dst_startX - dst_x_off + col - anX;
|
||||
int posY = (gY << 1);
|
||||
update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]),
|
||||
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
|
||||
update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]),
|
||||
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
|
||||
|
||||
uint4 tmp_sum[2]={(uint4)(0,0,0,0),(uint4)(0,0,0,0)};
|
||||
for(int k=0; k<2; k++)
|
||||
for(int i=-anX; i<=anX; i++)
|
||||
{
|
||||
tmp_sum[k] += temp[k][col+i];
|
||||
}
|
||||
for(int i=0; i<2; i++)
|
||||
{
|
||||
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
|
||||
dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = convert_uchar4(convert_float4(tmp_sum[i])/alpha);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////32fC1////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha,
|
||||
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step
|
||||
)
|
||||
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step
|
||||
)
|
||||
{
|
||||
int col = get_local_id(0);
|
||||
const int gX = get_group_id(0);
|
||||
@ -327,7 +336,6 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
|
||||
int startY = (gY << 1) - anY + src_y_off;
|
||||
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
|
||||
int dst_startY = (gY << 1) + dst_y_off;
|
||||
int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
|
||||
float data[ksY+1];
|
||||
__local float temp[2][THREADS];
|
||||
#ifdef BORDER_CONSTANT
|
||||
@ -336,28 +344,25 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
|
||||
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
|
||||
//ss = src[cur_addr];
|
||||
|
||||
int cur_col = clamp(startX + col, 0, src_whole_cols);
|
||||
//ss = src[(startY+i)*(src_step>>2) + cur_col];
|
||||
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:0;
|
||||
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:(float)0;
|
||||
|
||||
data[i] = con ? ss : 0.f;
|
||||
}
|
||||
#else
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
int selected_row;
|
||||
int selected_col;
|
||||
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
|
||||
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
int selected_row;
|
||||
int selected_col;
|
||||
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
|
||||
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
|
||||
|
||||
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
|
||||
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
|
||||
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
|
||||
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
|
||||
|
||||
data[i] = src[selected_row * (src_step>>2) + selected_col];
|
||||
}
|
||||
data[i] = src[selected_row * (src_step>>2) + selected_col];
|
||||
}
|
||||
|
||||
#endif
|
||||
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
|
||||
@ -376,7 +381,7 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
|
||||
int posX = dst_startX - dst_x_off + col - anX;
|
||||
int posY = (gY << 1);
|
||||
|
||||
float tmp_sum[2]={0.0, 0.0};
|
||||
float tmp_sum[2]= {0.0, 0.0};
|
||||
for(int k=0; k<2; k++)
|
||||
for(int i=-anX; i<=anX; i++)
|
||||
{
|
||||
@ -395,9 +400,9 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
|
||||
/////////////////////////////////////////32fC4////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha,
|
||||
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step
|
||||
)
|
||||
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step
|
||||
)
|
||||
{
|
||||
int col = get_local_id(0);
|
||||
const int gX = get_group_id(0);
|
||||
@ -412,7 +417,6 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
|
||||
int startY = (gY << 1) - anY + src_y_off;
|
||||
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
|
||||
int dst_startY = (gY << 1) + dst_y_off;
|
||||
int end_addr = (src_whole_rows-1)*(src_step>>4) + src_whole_cols-16;
|
||||
float4 data[ksY+1];
|
||||
__local float4 temp[2][THREADS];
|
||||
#ifdef BORDER_CONSTANT
|
||||
@ -421,28 +425,25 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
|
||||
//int cur_addr = clamp((startY+i)*(src_step>>4)+(startX+col),0,end_addr);
|
||||
//ss = src[cur_addr];
|
||||
|
||||
int cur_col = clamp(startX + col, 0, src_whole_cols);
|
||||
//ss = src[(startY+i)*(src_step>>4) + cur_col];
|
||||
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:0;
|
||||
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:(float4)0;
|
||||
|
||||
data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
|
||||
}
|
||||
#else
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
int selected_row;
|
||||
int selected_col;
|
||||
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
|
||||
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
int selected_row;
|
||||
int selected_col;
|
||||
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
|
||||
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
|
||||
|
||||
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
|
||||
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
|
||||
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
|
||||
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
|
||||
|
||||
data[i] = src[selected_row * (src_step>>4) + selected_col];
|
||||
}
|
||||
data[i] = src[selected_row * (src_step>>4) + selected_col];
|
||||
}
|
||||
|
||||
#endif
|
||||
float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
|
||||
@ -461,7 +462,7 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
|
||||
int posX = dst_startX - dst_x_off + col - anX;
|
||||
int posY = (gY << 1);
|
||||
|
||||
float4 tmp_sum[2]={(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
|
||||
float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
|
||||
for(int k=0; k<2; k++)
|
||||
for(int i=-anX; i<=anX; i++)
|
||||
{
|
||||
|
@ -112,7 +112,7 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
|
||||
} GpuHidHaarClassifierCascade;
|
||||
|
||||
|
||||
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(//constant GpuHidHaarClassifierCascade * cascade,
|
||||
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
|
||||
global GpuHidHaarStageClassifier * stagecascadeptr,
|
||||
global int4 * info,
|
||||
global GpuHidHaarTreeNode * nodeptr,
|
||||
@ -128,12 +128,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
const int splitnode,
|
||||
const int4 p,
|
||||
const int4 pq,
|
||||
const float correction
|
||||
//const int width,
|
||||
//const int height,
|
||||
//const int grpnumperline,
|
||||
//const int totalgrp
|
||||
)
|
||||
const float correction)
|
||||
{
|
||||
int grpszx = get_local_size(0);
|
||||
int grpszy = get_local_size(1);
|
||||
@ -145,13 +140,8 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
int lcl_sz = mul24(grpszx,grpszy);
|
||||
int lcl_id = mad24(lclidy,grpszx,lclidx);
|
||||
|
||||
//assume lcl_sz == 256 or 128 or 64
|
||||
//int lcl_sz_shift = (lcl_sz == 256) ? 8 : 7;
|
||||
//lcl_sz_shift = (lcl_sz == 64) ? 6 : lcl_sz_shift;
|
||||
__local int lclshare[1024];
|
||||
|
||||
#define OFF 0
|
||||
__local int* lcldata = lclshare + OFF;//for save win data
|
||||
__local int* lcldata = lclshare;//for save win data
|
||||
__local int* glboutindex = lcldata + 28*28;//for save global out index
|
||||
__local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel
|
||||
__local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel
|
||||
@ -181,7 +171,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
int totalgrp = scaleinfo1.y & 0xffff;
|
||||
int imgoff = scaleinfo1.z;
|
||||
float factor = as_float(scaleinfo1.w);
|
||||
//int ystep =1;// factor > 2.0 ? 1 : 2;
|
||||
|
||||
__global const int * sum = sum1 + imgoff;
|
||||
__global const float * sqsum = sqsum1 + imgoff;
|
||||
@ -191,8 +180,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
int grpidx = grploop - mul24(grpidy, grpnumperline);
|
||||
int x = mad24(grpidx,grpszx,lclidx);
|
||||
int y = mad24(grpidy,grpszy,lclidy);
|
||||
//candidate_result.x = convert_int_rtn(x*factor);
|
||||
//candidate_result.y = convert_int_rtn(y*factor);
|
||||
int grpoffx = x-lclidx;
|
||||
int grpoffy = y-lclidy;
|
||||
|
||||
@ -207,18 +194,11 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
int glb_x = grpoffx + (lcl_x<<2);
|
||||
int glb_y = grpoffy + lcl_y;
|
||||
|
||||
int glb_off = mad24(glb_y,pixelstep,glb_x);
|
||||
int glb_off = mad24(min(glb_y, height - 1),pixelstep,glb_x);
|
||||
int4 data = *(__global int4*)&sum[glb_off];
|
||||
int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
|
||||
|
||||
#if OFF
|
||||
lcldata[lcl_off] = data.x;
|
||||
lcldata[lcl_off+1] = data.y;
|
||||
lcldata[lcl_off+2] = data.z;
|
||||
lcldata[lcl_off+3] = data.w;
|
||||
#else
|
||||
vstore4(data, 0, &lcldata[lcl_off]);
|
||||
#endif
|
||||
}
|
||||
|
||||
lcloutindex[lcl_id] = 0;
|
||||
@ -231,184 +211,170 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
int lcl_off = mad24(lclidy,readwidth,lclidx);
|
||||
int4 cascadeinfo1, cascadeinfo2;
|
||||
cascadeinfo1 = p;
|
||||
cascadeinfo2 = pq;// + mad24(y, pixelstep, x);
|
||||
cascadeinfo2 = pq;
|
||||
|
||||
cascadeinfo1.x +=lcl_off;
|
||||
cascadeinfo1.z +=lcl_off;
|
||||
mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
|
||||
lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
|
||||
*correction;
|
||||
|
||||
//if((x < width) && (y < height))
|
||||
int p_offset = mad24(y, pixelstep, x);
|
||||
|
||||
cascadeinfo2.x +=p_offset;
|
||||
cascadeinfo2.z +=p_offset;
|
||||
variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
|
||||
sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
|
||||
|
||||
variance_norm_factor = variance_norm_factor * correction - mean * mean;
|
||||
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
|
||||
|
||||
for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
|
||||
{
|
||||
cascadeinfo1.x +=lcl_off;
|
||||
cascadeinfo1.z +=lcl_off;
|
||||
mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
|
||||
lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
|
||||
*correction;
|
||||
float stage_sum = 0.f;
|
||||
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
|
||||
float stagethreshold = as_float(stageinfo.y);
|
||||
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
|
||||
{
|
||||
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
|
||||
|
||||
int p_offset = mad24(y, pixelstep, x);
|
||||
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
|
||||
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
|
||||
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
|
||||
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
|
||||
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
|
||||
float nodethreshold = w.w * variance_norm_factor;
|
||||
|
||||
cascadeinfo2.x +=p_offset;
|
||||
cascadeinfo2.z +=p_offset;
|
||||
variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
|
||||
sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
|
||||
info1.x +=lcl_off;
|
||||
info1.z +=lcl_off;
|
||||
info2.x +=lcl_off;
|
||||
info2.z +=lcl_off;
|
||||
|
||||
variance_norm_factor = variance_norm_factor * correction - mean * mean;
|
||||
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
|
||||
//if( cascade->is_stump_based )
|
||||
//{
|
||||
for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
|
||||
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
|
||||
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
|
||||
|
||||
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
|
||||
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
|
||||
|
||||
info3.x +=lcl_off;
|
||||
info3.z +=lcl_off;
|
||||
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
|
||||
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
|
||||
|
||||
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
|
||||
nodecounter++;
|
||||
}
|
||||
|
||||
result = (stage_sum >= stagethreshold);
|
||||
}
|
||||
|
||||
if(result && (x < width) && (y < height))
|
||||
{
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
|
||||
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int queuecount = lclcount[0];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
nodecounter = splitnode;
|
||||
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++)
|
||||
{
|
||||
lclcount[0]=0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
|
||||
float stagethreshold = as_float(stageinfo.y);
|
||||
|
||||
int perfscale = queuecount > 4 ? 3 : 2;
|
||||
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
|
||||
int lcl_compute_win = lcl_sz >> perfscale;
|
||||
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
|
||||
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
|
||||
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
|
||||
for(int queueloop=0; queueloop<queuecount_loop; queueloop++)
|
||||
{
|
||||
float stage_sum = 0.f;
|
||||
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
|
||||
float stagethreshold = as_float(stageinfo.y);
|
||||
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
|
||||
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
|
||||
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
|
||||
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
|
||||
|
||||
if(lcl_compute_win_id < queuecount)
|
||||
{
|
||||
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
|
||||
|
||||
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
|
||||
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
|
||||
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
|
||||
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
|
||||
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
|
||||
float nodethreshold = w.w * variance_norm_factor;
|
||||
int tempnodecounter = lcl_compute_id;
|
||||
float part_sum = 0.f;
|
||||
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
|
||||
{
|
||||
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
|
||||
|
||||
info1.x +=lcl_off;
|
||||
info1.z +=lcl_off;
|
||||
info2.x +=lcl_off;
|
||||
info2.z +=lcl_off;
|
||||
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
|
||||
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
|
||||
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
|
||||
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
|
||||
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
|
||||
float nodethreshold = w.w * variance_norm_factor;
|
||||
|
||||
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
|
||||
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
|
||||
info1.x +=queue_pixel;
|
||||
info1.z +=queue_pixel;
|
||||
info2.x +=queue_pixel;
|
||||
info2.z +=queue_pixel;
|
||||
|
||||
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
|
||||
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
|
||||
|
||||
|
||||
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
|
||||
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
|
||||
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
|
||||
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
|
||||
|
||||
info3.x +=queue_pixel;
|
||||
info3.z +=queue_pixel;
|
||||
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
|
||||
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
|
||||
|
||||
//if((info3.z - info3.x) && (!stageinfo.z))
|
||||
//{
|
||||
info3.x +=lcl_off;
|
||||
info3.z +=lcl_off;
|
||||
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
|
||||
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
|
||||
//}
|
||||
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
|
||||
nodecounter++;
|
||||
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
|
||||
tempnodecounter +=lcl_compute_win;
|
||||
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
|
||||
partialsum[lcl_id]=part_sum;
|
||||
}
|
||||
|
||||
result = (stage_sum >= stagethreshold);
|
||||
}
|
||||
|
||||
if(result && (x < width) && (y < height))
|
||||
{
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
|
||||
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int queuecount = lclcount[0];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
nodecounter = splitnode;
|
||||
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++)
|
||||
{
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
//if(lcl_id == 0)
|
||||
lclcount[0]=0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
|
||||
float stagethreshold = as_float(stageinfo.y);
|
||||
|
||||
int perfscale = queuecount > 4 ? 3 : 2;
|
||||
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
|
||||
int lcl_compute_win = lcl_sz >> perfscale;
|
||||
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
|
||||
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
|
||||
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
|
||||
for(int queueloop=0; queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/; queueloop++)
|
||||
if(lcl_compute_win_id < queuecount)
|
||||
{
|
||||
float stage_sum = 0.f;
|
||||
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
|
||||
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
|
||||
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
|
||||
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lcl_compute_win_id < queuecount)
|
||||
for(int i=0; i<lcl_compute_win && (lcl_compute_id==0); i++)
|
||||
{
|
||||
|
||||
int tempnodecounter = lcl_compute_id;
|
||||
float part_sum = 0.f;
|
||||
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
|
||||
{
|
||||
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
|
||||
|
||||
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
|
||||
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
|
||||
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
|
||||
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
|
||||
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
|
||||
float nodethreshold = w.w * variance_norm_factor;
|
||||
|
||||
info1.x +=queue_pixel;
|
||||
info1.z +=queue_pixel;
|
||||
info2.x +=queue_pixel;
|
||||
info2.z +=queue_pixel;
|
||||
|
||||
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
|
||||
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
|
||||
|
||||
|
||||
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
|
||||
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
|
||||
//if((info3.z - info3.x) && (!stageinfo.z))
|
||||
//{
|
||||
info3.x +=queue_pixel;
|
||||
info3.z +=queue_pixel;
|
||||
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
|
||||
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
|
||||
//}
|
||||
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
|
||||
tempnodecounter +=lcl_compute_win;
|
||||
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
|
||||
partialsum[lcl_id]=part_sum;
|
||||
stage_sum += partialsum[lcl_id+i];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lcl_compute_win_id < queuecount)
|
||||
if(stage_sum >= stagethreshold && (lcl_compute_id==0))
|
||||
{
|
||||
for(int i=0; i<lcl_compute_win && (lcl_compute_id==0); i++)
|
||||
{
|
||||
stage_sum += partialsum[lcl_id+i];
|
||||
}
|
||||
if(stage_sum >= stagethreshold && (lcl_compute_id==0))
|
||||
{
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex<<1] = temp_coord;
|
||||
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
|
||||
}
|
||||
lcl_compute_win_id +=(1<<perfscale);
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex<<1] = temp_coord;
|
||||
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
queuecount = lclcount[0];
|
||||
lcl_compute_win_id +=(1<<perfscale);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
nodecounter += stageinfo.x;
|
||||
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lcl_id<queuecount)
|
||||
{
|
||||
int temp = lcloutindex[lcl_id<<1];
|
||||
int x = mad24(grpidx,grpszx,temp & 0xffff);
|
||||
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
|
||||
temp = glboutindex[0];
|
||||
int4 candidate_result;
|
||||
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
|
||||
candidate_result.x = convert_int_rtn(x*factor);
|
||||
candidate_result.y = convert_int_rtn(y*factor);
|
||||
atomic_inc(glboutindex);
|
||||
candidate[outputoff+temp+lcl_id] = candidate_result;
|
||||
}
|
||||
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
|
||||
|
||||
queuecount = lclcount[0];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}//end if((x < width) && (y < height))
|
||||
nodecounter += stageinfo.x;
|
||||
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
|
||||
|
||||
if(lcl_id<queuecount)
|
||||
{
|
||||
int temp = lcloutindex[lcl_id<<1];
|
||||
int x = mad24(grpidx,grpszx,temp & 0xffff);
|
||||
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
|
||||
temp = glboutindex[0];
|
||||
int4 candidate_result;
|
||||
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
|
||||
candidate_result.x = convert_int_rtn(x*factor);
|
||||
candidate_result.y = convert_int_rtn(y*factor);
|
||||
atomic_inc(glboutindex);
|
||||
candidate[outputoff+temp+lcl_id] = candidate_result;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
|
||||
//outputoff +=mul24(width,height);
|
||||
}//end for(int scalei = 0; scalei <loopcount; scalei++)
|
||||
}
|
||||
|
||||
|
@ -16,6 +16,7 @@
|
||||
//
|
||||
// @Authors
|
||||
// Wu Xinglong, wxl370@126.com
|
||||
// Sen Liu, swjtuls1987@126.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -52,11 +53,11 @@ typedef struct __attribute__((aligned(128))) GpuHidHaarFeature
|
||||
{
|
||||
struct __attribute__((aligned(32)))
|
||||
{
|
||||
int p0 __attribute__((aligned(4)));
|
||||
int p1 __attribute__((aligned(4)));
|
||||
int p2 __attribute__((aligned(4)));
|
||||
int p3 __attribute__((aligned(4)));
|
||||
float weight __attribute__((aligned(4)));
|
||||
int p0 __attribute__((aligned(4)));
|
||||
int p1 __attribute__((aligned(4)));
|
||||
int p2 __attribute__((aligned(4)));
|
||||
int p3 __attribute__((aligned(4)));
|
||||
float weight __attribute__((aligned(4)));
|
||||
}
|
||||
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned(32)));
|
||||
}
|
||||
@ -113,173 +114,168 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
|
||||
global const int *restrict sum,
|
||||
global const float *restrict sqsum,
|
||||
global int4 *candidate,
|
||||
const int rows,
|
||||
const int cols,
|
||||
const int step,
|
||||
const int loopcount,
|
||||
const int start_stage,
|
||||
const int split_stage,
|
||||
const int end_stage,
|
||||
const int startnode,
|
||||
const int splitnode,
|
||||
global int4 *p,
|
||||
//const int4 * pq,
|
||||
global float *correction,
|
||||
const int nodecount)
|
||||
{
|
||||
int grpszx = get_local_size(0);
|
||||
int grpszy = get_local_size(1);
|
||||
int grpnumx = get_num_groups(0);
|
||||
int grpidx = get_group_id(0);
|
||||
int lclidx = get_local_id(0);
|
||||
int lclidy = get_local_id(1);
|
||||
int lcl_sz = mul24(grpszx, grpszy);
|
||||
int lcl_id = mad24(lclidy, grpszx, lclidx);
|
||||
__local int lclshare[1024];
|
||||
__local int *glboutindex = lclshare + 0;
|
||||
__local int *lclcount = glboutindex + 1;
|
||||
__local int *lcloutindex = lclcount + 1;
|
||||
__local float *partialsum = (__local float *)(lcloutindex + (lcl_sz << 1));
|
||||
glboutindex[0] = 0;
|
||||
int outputoff = mul24(grpidx, 256);
|
||||
candidate[outputoff + (lcl_id << 2)] = (int4)0;
|
||||
candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
|
||||
candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
|
||||
candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
|
||||
int grpszx = get_local_size(0);
|
||||
int grpszy = get_local_size(1);
|
||||
int grpnumx = get_num_groups(0);
|
||||
int grpidx = get_group_id(0);
|
||||
int lclidx = get_local_id(0);
|
||||
int lclidy = get_local_id(1);
|
||||
int lcl_sz = mul24(grpszx, grpszy);
|
||||
int lcl_id = mad24(lclidy, grpszx, lclidx);
|
||||
__local int glboutindex[1];
|
||||
__local int lclcount[1];
|
||||
__local int lcloutindex[64];
|
||||
glboutindex[0] = 0;
|
||||
int outputoff = mul24(grpidx, 256);
|
||||
candidate[outputoff + (lcl_id << 2)] = (int4)0;
|
||||
candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
|
||||
candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
|
||||
candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
|
||||
int max_idx = rows * cols - 1;
|
||||
for (int scalei = 0; scalei < loopcount; scalei++)
|
||||
{
|
||||
int4 scaleinfo1;
|
||||
scaleinfo1 = info[scalei];
|
||||
int width = (scaleinfo1.x & 0xffff0000) >> 16;
|
||||
int height = scaleinfo1.x & 0xffff;
|
||||
int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
|
||||
int totalgrp = scaleinfo1.y & 0xffff;
|
||||
float factor = as_float(scaleinfo1.w);
|
||||
float correction_t = correction[scalei];
|
||||
int ystep = (int)(max(2.0f, factor) + 0.5f);
|
||||
|
||||
for (int scalei = 0; scalei < loopcount; scalei++)
|
||||
for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
|
||||
{
|
||||
int4 scaleinfo1;
|
||||
scaleinfo1 = info[scalei];
|
||||
int width = (scaleinfo1.x & 0xffff0000) >> 16;
|
||||
int height = scaleinfo1.x & 0xffff;
|
||||
int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
|
||||
int totalgrp = scaleinfo1.y & 0xffff;
|
||||
float factor = as_float(scaleinfo1.w);
|
||||
float correction_t = correction[scalei];
|
||||
int ystep = (int)(max(2.0f, factor) + 0.5f);
|
||||
int4 cascadeinfo = p[scalei];
|
||||
int grpidy = grploop / grpnumperline;
|
||||
int grpidx = grploop - mul24(grpidy, grpnumperline);
|
||||
int ix = mad24(grpidx, grpszx, lclidx);
|
||||
int iy = mad24(grpidy, grpszy, lclidy);
|
||||
int x = ix * ystep;
|
||||
int y = iy * ystep;
|
||||
lcloutindex[lcl_id] = 0;
|
||||
lclcount[0] = 0;
|
||||
int nodecounter;
|
||||
float mean, variance_norm_factor;
|
||||
//if((ix < width) && (iy < height))
|
||||
{
|
||||
const int p_offset = mad24(y, step, x);
|
||||
cascadeinfo.x += p_offset;
|
||||
cascadeinfo.z += p_offset;
|
||||
mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
|
||||
sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
|
||||
* correction_t;
|
||||
variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
|
||||
sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
|
||||
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
|
||||
variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
|
||||
bool result = true;
|
||||
nodecounter = startnode + nodecount * scalei;
|
||||
|
||||
for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
|
||||
for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
|
||||
{
|
||||
int4 cascadeinfo = p[scalei];
|
||||
int grpidy = grploop / grpnumperline;
|
||||
int grpidx = grploop - mul24(grpidy, grpnumperline);
|
||||
int ix = mad24(grpidx, grpszx, lclidx);
|
||||
int iy = mad24(grpidy, grpszy, lclidy);
|
||||
int x = ix * ystep;
|
||||
int y = iy * ystep;
|
||||
lcloutindex[lcl_id] = 0;
|
||||
lclcount[0] = 0;
|
||||
int result = 1, nodecounter;
|
||||
float mean, variance_norm_factor;
|
||||
//if((ix < width) && (iy < height))
|
||||
{
|
||||
const int p_offset = mad24(y, step, x);
|
||||
cascadeinfo.x += p_offset;
|
||||
cascadeinfo.z += p_offset;
|
||||
mean = (sum[mad24(cascadeinfo.y, step, cascadeinfo.x)] - sum[mad24(cascadeinfo.y, step, cascadeinfo.z)] -
|
||||
sum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sum[mad24(cascadeinfo.w, step, cascadeinfo.z)])
|
||||
* correction_t;
|
||||
variance_norm_factor = sqsum[mad24(cascadeinfo.y, step, cascadeinfo.x)] - sqsum[mad24(cascadeinfo.y, step, cascadeinfo.z)] -
|
||||
sqsum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sqsum[mad24(cascadeinfo.w, step, cascadeinfo.z)];
|
||||
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
|
||||
variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
|
||||
result = 1;
|
||||
nodecounter = startnode + nodecount * scalei;
|
||||
|
||||
for (int stageloop = start_stage; stageloop < end_stage && result; stageloop++)
|
||||
{
|
||||
float stage_sum = 0.f;
|
||||
int4 stageinfo = *(global int4 *)(stagecascadeptr + stageloop);
|
||||
float stagethreshold = as_float(stageinfo.y);
|
||||
|
||||
for (int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++)
|
||||
{
|
||||
__global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
|
||||
int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
|
||||
int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
|
||||
int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
|
||||
float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
|
||||
float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0]));
|
||||
float nodethreshold = w.w * variance_norm_factor;
|
||||
info1.x += p_offset;
|
||||
info1.z += p_offset;
|
||||
info2.x += p_offset;
|
||||
info2.z += p_offset;
|
||||
float classsum = (sum[mad24(info1.y, step, info1.x)] - sum[mad24(info1.y, step, info1.z)] -
|
||||
sum[mad24(info1.w, step, info1.x)] + sum[mad24(info1.w, step, info1.z)]) * w.x;
|
||||
classsum += (sum[mad24(info2.y, step, info2.x)] - sum[mad24(info2.y, step, info2.z)] -
|
||||
sum[mad24(info2.w, step, info2.x)] + sum[mad24(info2.w, step, info2.z)]) * w.y;
|
||||
info3.x += p_offset;
|
||||
info3.z += p_offset;
|
||||
classsum += (sum[mad24(info3.y, step, info3.x)] - sum[mad24(info3.y, step, info3.z)] -
|
||||
sum[mad24(info3.w, step, info3.x)] + sum[mad24(info3.w, step, info3.z)]) * w.z;
|
||||
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
|
||||
nodecounter++;
|
||||
}
|
||||
|
||||
result = (stage_sum >= stagethreshold);
|
||||
}
|
||||
|
||||
if (result && (ix < width) && (iy < height))
|
||||
{
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex << 1] = (y << 16) | x;
|
||||
lcloutindex[(queueindex << 1) + 1] = as_int(variance_norm_factor);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int queuecount = lclcount[0];
|
||||
nodecounter = splitnode + nodecount * scalei;
|
||||
|
||||
if (lcl_id < queuecount)
|
||||
{
|
||||
int temp = lcloutindex[lcl_id << 1];
|
||||
int x = temp & 0xffff;
|
||||
int y = (temp & (int)0xffff0000) >> 16;
|
||||
temp = glboutindex[0];
|
||||
int4 candidate_result;
|
||||
candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
|
||||
candidate_result.x = x;
|
||||
candidate_result.y = y;
|
||||
atomic_inc(glboutindex);
|
||||
candidate[outputoff + temp + lcl_id] = candidate_result;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
float stage_sum = 0.f;
|
||||
int stagecount = stagecascadeptr[stageloop].count;
|
||||
for (int nodeloop = 0; nodeloop < stagecount; nodeloop++)
|
||||
{
|
||||
__global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
|
||||
int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
|
||||
int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
|
||||
int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
|
||||
float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
|
||||
float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0]));
|
||||
float nodethreshold = w.w * variance_norm_factor;
|
||||
info1.x += p_offset;
|
||||
info1.z += p_offset;
|
||||
info2.x += p_offset;
|
||||
info2.z += p_offset;
|
||||
float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)] - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
|
||||
sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)] + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
|
||||
classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)] - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
|
||||
sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)] + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
|
||||
info3.x += p_offset;
|
||||
info3.z += p_offset;
|
||||
classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)] - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
|
||||
sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
|
||||
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
|
||||
nodecounter++;
|
||||
}
|
||||
result = (bool)(stage_sum >= stagecascadeptr[stageloop].threshold);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (result && (ix < width) && (iy < height))
|
||||
{
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex] = (y << 16) | x;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int queuecount = lclcount[0];
|
||||
|
||||
if (lcl_id < queuecount)
|
||||
{
|
||||
int temp = lcloutindex[lcl_id];
|
||||
int x = temp & 0xffff;
|
||||
int y = (temp & (int)0xffff0000) >> 16;
|
||||
temp = atomic_inc(glboutindex);
|
||||
int4 candidate_result;
|
||||
candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
|
||||
candidate_result.x = x;
|
||||
candidate_result.y = y;
|
||||
candidate[outputoff + temp + lcl_id] = candidate_result;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum)
|
||||
{
|
||||
int counter = get_global_id(0);
|
||||
int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
|
||||
GpuHidHaarTreeNode t1 = *(orinode + counter);
|
||||
int counter = get_global_id(0);
|
||||
int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
|
||||
GpuHidHaarTreeNode t1 = *(orinode + counter);
|
||||
#pragma unroll
|
||||
|
||||
for (i = 0; i < 3; i++)
|
||||
{
|
||||
tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f);
|
||||
tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
|
||||
tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
|
||||
tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
|
||||
}
|
||||
for (i = 0; i < 3; i++)
|
||||
{
|
||||
tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f);
|
||||
tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
|
||||
tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
|
||||
tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
|
||||
}
|
||||
|
||||
t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]);
|
||||
counter += nodenum;
|
||||
t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]);
|
||||
counter += nodenum;
|
||||
#pragma unroll
|
||||
|
||||
for (i = 0; i < 3; i++)
|
||||
{
|
||||
newnode[counter].p[i][0] = tr_x[i];
|
||||
newnode[counter].p[i][1] = tr_y[i];
|
||||
newnode[counter].p[i][2] = tr_x[i] + tr_w[i];
|
||||
newnode[counter].p[i][3] = tr_y[i] + tr_h[i];
|
||||
newnode[counter].weight[i] = t1.weight[i] * weight_scale;
|
||||
}
|
||||
for (i = 0; i < 3; i++)
|
||||
{
|
||||
newnode[counter].p[i][0] = tr_x[i];
|
||||
newnode[counter].p[i][1] = tr_y[i];
|
||||
newnode[counter].p[i][2] = tr_x[i] + tr_w[i];
|
||||
newnode[counter].p[i][3] = tr_y[i] + tr_h[i];
|
||||
newnode[counter].weight[i] = t1.weight[i] * weight_scale;
|
||||
}
|
||||
|
||||
newnode[counter].left = t1.left;
|
||||
newnode[counter].right = t1.right;
|
||||
newnode[counter].threshold = t1.threshold;
|
||||
newnode[counter].alpha[0] = t1.alpha[0];
|
||||
newnode[counter].alpha[1] = t1.alpha[1];
|
||||
newnode[counter].left = t1.left;
|
||||
newnode[counter].right = t1.right;
|
||||
newnode[counter].threshold = t1.threshold;
|
||||
newnode[counter].alpha[0] = t1.alpha[0];
|
||||
newnode[counter].alpha[1] = t1.alpha[1];
|
||||
}
|
||||
|
||||
|
@ -60,7 +60,7 @@
|
||||
#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
|
||||
|
||||
|
||||
kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float *sqsum,
|
||||
kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global float *sqsum,
|
||||
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
@ -159,7 +159,7 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
|
||||
}
|
||||
|
||||
|
||||
kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__global int *sum ,
|
||||
kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__global int *sum ,
|
||||
__global float *sqsum,int rows,int cols,int src_step,int sum_step,
|
||||
int sqsum_step,int sum_offset,int sqsum_offset)
|
||||
{
|
||||
@ -275,3 +275,219 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global float *sqsum,
|
||||
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
float4 src_t[2], sum_t[2];
|
||||
float4 sqsum_t[2];
|
||||
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
|
||||
__local float* sum_p;
|
||||
__local float* sqsum_p;
|
||||
src_step = src_step >> 2;
|
||||
gid = gid << 1;
|
||||
for(int i = 0; i < rows; i =i + LSIZE_1)
|
||||
{
|
||||
src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : (float4)0);
|
||||
src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : (float4)0);
|
||||
|
||||
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
|
||||
lm_sum[0][bf_loc] = src_t[0];
|
||||
lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]);
|
||||
|
||||
lm_sum[1][bf_loc] = src_t[1];
|
||||
lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]);
|
||||
|
||||
int offset = 1;
|
||||
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
|
||||
}
|
||||
offset <<= 1;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid < 2)
|
||||
{
|
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
|
||||
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
|
||||
}
|
||||
for(int d = 1; d < LSIZE; d <<= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
offset >>= 1;
|
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
|
||||
|
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
|
||||
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
|
||||
if(lid > 0 && (i+lid) <= rows)
|
||||
{
|
||||
lm_sum[0][bf_loc] += sum_t[0];
|
||||
lm_sum[1][bf_loc] += sum_t[1];
|
||||
lm_sqsum[0][bf_loc] += sqsum_t[0];
|
||||
lm_sqsum[1][bf_loc] += sqsum_t[1];
|
||||
sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
|
||||
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
|
||||
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
|
||||
sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k];
|
||||
}
|
||||
sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
|
||||
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
|
||||
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
|
||||
sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
kernel void integral_rows_D5(__global float4 *srcsum,__global float4 * srcsqsum,__global float *sum ,
|
||||
__global float *sqsum,int rows,int cols,int src_step,int sum_step,
|
||||
int sqsum_step,int sum_offset,int sqsum_offset)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
float4 src_t[2], sum_t[2];
|
||||
float4 sqsrc_t[2],sqsum_t[2];
|
||||
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
|
||||
__local float *sum_p;
|
||||
__local float *sqsum_p;
|
||||
src_step = src_step >> 4;
|
||||
for(int i = 0; i < rows; i =i + LSIZE_1)
|
||||
{
|
||||
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0;
|
||||
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0;
|
||||
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
|
||||
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
|
||||
|
||||
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
|
||||
lm_sum[0][bf_loc] = src_t[0];
|
||||
lm_sqsum[0][bf_loc] = sqsrc_t[0];
|
||||
|
||||
lm_sum[1][bf_loc] = src_t[1];
|
||||
lm_sqsum[1][bf_loc] = sqsrc_t[1];
|
||||
|
||||
int offset = 1;
|
||||
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
|
||||
}
|
||||
offset <<= 1;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid < 2)
|
||||
{
|
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
|
||||
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
|
||||
}
|
||||
for(int d = 1; d < LSIZE; d <<= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
offset >>= 1;
|
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
|
||||
|
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
|
||||
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(gid == 0 && (i + lid) <= rows)
|
||||
{
|
||||
sum[sum_offset + i + lid] = 0;
|
||||
sqsum[sqsum_offset + i + lid] = 0;
|
||||
}
|
||||
if(i + lid == 0)
|
||||
{
|
||||
int loc0 = gid * 2 * sum_step;
|
||||
int loc1 = gid * 2 * sqsum_step;
|
||||
for(int k = 1; k <= 8; k++)
|
||||
{
|
||||
if(gid * 8 + k > cols) break;
|
||||
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
|
||||
sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0;
|
||||
}
|
||||
}
|
||||
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
|
||||
int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
|
||||
if(lid > 0 && (i+lid) <= rows)
|
||||
{
|
||||
lm_sum[0][bf_loc] += sum_t[0];
|
||||
lm_sum[1][bf_loc] += sum_t[1];
|
||||
lm_sqsum[0][bf_loc] += sqsum_t[0];
|
||||
lm_sqsum[1][bf_loc] += sqsum_t[1];
|
||||
sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
|
||||
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
if(gid * 8 + k >= cols) break;
|
||||
sum[loc_s0 + k * sum_step / 4] = sum_p[k];
|
||||
sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k];
|
||||
}
|
||||
sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
|
||||
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
if(gid * 8 + 4 + k >= cols) break;
|
||||
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
|
||||
sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
@ -44,8 +44,13 @@
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define LSIZE 256
|
||||
#define LSIZE_1 255
|
||||
#define LSIZE_2 254
|
||||
@ -56,8 +61,8 @@
|
||||
#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
|
||||
|
||||
|
||||
kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
|
||||
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
|
||||
kernel void integral_sum_cols_D4(__global uchar4 *src,__global int *sum ,
|
||||
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
@ -114,7 +119,8 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid > 0 && (i+lid) <= rows){
|
||||
if(lid > 0 && (i+lid) <= rows)
|
||||
{
|
||||
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
|
||||
lm_sum[0][bf_loc] += sum_t[0];
|
||||
lm_sum[1][bf_loc] += sum_t[1];
|
||||
@ -136,9 +142,9 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
|
||||
}
|
||||
|
||||
|
||||
kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
|
||||
int rows,int cols,int src_step,int sum_step,
|
||||
int sum_offset)
|
||||
kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum ,
|
||||
int rows,int cols,int src_step,int sum_step,
|
||||
int sum_offset)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
@ -196,19 +202,20 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(gid == 0 && (i + lid) <= rows)
|
||||
{
|
||||
sum[sum_offset + i + lid] = 0;
|
||||
sum[sum_offset + i + lid] = 0;
|
||||
}
|
||||
if(i + lid == 0)
|
||||
{
|
||||
int loc0 = gid * 2 * sum_step;
|
||||
for(int k = 1;k <= 8;k++)
|
||||
for(int k = 1; k <= 8; k++)
|
||||
{
|
||||
if(gid * 8 + k > cols) break;
|
||||
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if(lid > 0 && (i+lid) <= rows){
|
||||
if(lid > 0 && (i+lid) <= rows)
|
||||
{
|
||||
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
|
||||
lm_sum[0][bf_loc] += sum_t[0];
|
||||
lm_sum[1][bf_loc] += sum_t[1];
|
||||
@ -228,3 +235,178 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum ,
|
||||
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
float4 src_t[2], sum_t[2];
|
||||
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
__local float* sum_p;
|
||||
src_step = src_step >> 2;
|
||||
gid = gid << 1;
|
||||
for(int i = 0; i < rows; i =i + LSIZE_1)
|
||||
{
|
||||
src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid]) : (float4)0);
|
||||
src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid + 1]) : (float4)0);
|
||||
|
||||
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
|
||||
lm_sum[0][bf_loc] = src_t[0];
|
||||
|
||||
lm_sum[1][bf_loc] = src_t[1];
|
||||
|
||||
int offset = 1;
|
||||
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
}
|
||||
offset <<= 1;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid < 2)
|
||||
{
|
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
|
||||
}
|
||||
for(int d = 1; d < LSIZE; d <<= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
offset >>= 1;
|
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid > 0 && (i+lid) <= rows)
|
||||
{
|
||||
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
|
||||
lm_sum[0][bf_loc] += sum_t[0];
|
||||
lm_sum[1][bf_loc] += sum_t[1];
|
||||
sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
|
||||
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
|
||||
}
|
||||
sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
|
||||
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
kernel void integral_sum_rows_D5(__global float4 *srcsum,__global float *sum ,
|
||||
int rows,int cols,int src_step,int sum_step,
|
||||
int sum_offset)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
float4 src_t[2], sum_t[2];
|
||||
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
__local float *sum_p;
|
||||
src_step = src_step >> 4;
|
||||
for(int i = 0; i < rows; i =i + LSIZE_1)
|
||||
{
|
||||
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0;
|
||||
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
|
||||
|
||||
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
|
||||
lm_sum[0][bf_loc] = src_t[0];
|
||||
|
||||
lm_sum[1][bf_loc] = src_t[1];
|
||||
|
||||
int offset = 1;
|
||||
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
}
|
||||
offset <<= 1;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid < 2)
|
||||
{
|
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
|
||||
}
|
||||
for(int d = 1; d < LSIZE; d <<= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
offset >>= 1;
|
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
|
||||
ai += GET_CONFLICT_OFFSET(ai);
|
||||
bi += GET_CONFLICT_OFFSET(bi);
|
||||
|
||||
if((lid & 127) < d)
|
||||
{
|
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
|
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(gid == 0 && (i + lid) <= rows)
|
||||
{
|
||||
sum[sum_offset + i + lid] = 0;
|
||||
}
|
||||
if(i + lid == 0)
|
||||
{
|
||||
int loc0 = gid * 2 * sum_step;
|
||||
for(int k = 1; k <= 8; k++)
|
||||
{
|
||||
if(gid * 8 + k > cols) break;
|
||||
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if(lid > 0 && (i+lid) <= rows)
|
||||
{
|
||||
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
|
||||
lm_sum[0][bf_loc] += sum_t[0];
|
||||
lm_sum[1][bf_loc] += sum_t[1];
|
||||
sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
if(gid * 8 + k >= cols) break;
|
||||
sum[loc_s0 + k * sum_step / 4] = sum_p[k];
|
||||
}
|
||||
sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
|
||||
for(int k = 0; k < 4; k++)
|
||||
{
|
||||
if(gid * 8 + 4 + k >= cols) break;
|
||||
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
|
@ -1,3 +1,48 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Sen Liu, swjtuls1987@126.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
|
||||
#ifdef cl_khr_fp64
|
||||
@ -609,22 +654,33 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
|
||||
int y = wgidy*TILE_SIZE; // real Y index of pixel
|
||||
int x = wgidx*TILE_SIZE; // real X index of pixel
|
||||
int kcn = (cn==2)?2:4;
|
||||
int rstep = min(src_step/4, TILE_SIZE);
|
||||
src_step /= sizeof(*src_data);
|
||||
int rstep = min(src_step, TILE_SIZE);
|
||||
tileSize_height = min(TILE_SIZE, src_rows - y);
|
||||
tileSize_width = min(TILE_SIZE, src_cols -x);
|
||||
if(tileSize_width < TILE_SIZE)
|
||||
for(int i = tileSize_width; i < rstep; i++ )
|
||||
*((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
|
||||
int maxIdx = mul24(src_rows, src_cols);
|
||||
int yOff = (y+lidy)*src_step;
|
||||
int index;
|
||||
if(tileSize_width < TILE_SIZE && yOff < src_rows)
|
||||
for(int i = tileSize_width; i < rstep && (yOff+x+i) < maxIdx; i++ )
|
||||
*(src_data+yOff+x+i) = 0;
|
||||
if( coi > 0 )
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_F)
|
||||
{
|
||||
#pragma unroll
|
||||
for(int j=0; j<4; j++)
|
||||
tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
|
||||
{
|
||||
index = yOff+(x+i+j)*kcn+coi-1;
|
||||
if (index < maxIdx)
|
||||
tmp_coi[j] = *(src_data+index);
|
||||
else
|
||||
tmp_coi[j] = 0;
|
||||
}
|
||||
tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
|
||||
}
|
||||
else
|
||||
for(int i=0; i < tileSize_width; i+=VLEN_F)
|
||||
tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
|
||||
for(int i=0; i < tileSize_width && (yOff+x+i) < maxIdx; i+=VLEN_F)
|
||||
tmp[i/VLEN_F] = (*(__global float4 *)(src_data+yOff+x+i));
|
||||
float4 zero = (float4)(0);
|
||||
float4 full = (float4)(255);
|
||||
if( binary )
|
||||
@ -714,35 +770,59 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
|
||||
// accumulate moments computed in each tile
|
||||
dst_step /= sizeof(F);
|
||||
|
||||
int dst_x_off = mad24(wgidy, dst_cols, wgidx);
|
||||
int dst_off = 0;
|
||||
int max_dst_index = 10 * blocky * get_global_size(1);
|
||||
|
||||
// + m00 ( = m00' )
|
||||
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
|
||||
dst_off = mad24(DST_ROW_00 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[0];
|
||||
|
||||
// + m10 ( = m10' + x*m00' )
|
||||
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
|
||||
dst_off = mad24(DST_ROW_10 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[1] + xm;
|
||||
|
||||
// + m01 ( = m01' + y*m00' )
|
||||
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
|
||||
dst_off = mad24(DST_ROW_01 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[2] + ym;
|
||||
|
||||
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
|
||||
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
|
||||
dst_off = mad24(DST_ROW_20 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[3] + x * (mom[1] * 2 + xm);
|
||||
|
||||
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
|
||||
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
|
||||
dst_off = mad24(DST_ROW_11 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[4] + x * (mom[2] + ym) + y * mom[1];
|
||||
|
||||
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
|
||||
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
|
||||
dst_off = mad24(DST_ROW_02 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[5] + y * (mom[2] * 2 + ym);
|
||||
|
||||
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
|
||||
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
|
||||
dst_off = mad24(DST_ROW_30 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
|
||||
|
||||
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
|
||||
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
|
||||
dst_off = mad24(DST_ROW_21 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
|
||||
|
||||
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
|
||||
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
|
||||
dst_off = mad24(DST_ROW_12 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
|
||||
|
||||
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
|
||||
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
|
||||
dst_off = mad24(DST_ROW_03 * blocky, dst_step, dst_x_off);
|
||||
if (dst_off < max_dst_index)
|
||||
*(dst_m + dst_off) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -18,6 +18,7 @@
|
||||
// Zhang Chunpeng chunpeng@multicorewareinc.com
|
||||
// Dachuan Zhao, dachuan@multicorewareinc.com
|
||||
// Yao Wang, yao@multicorewareinc.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -47,7 +48,7 @@
|
||||
|
||||
//#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
|
||||
uchar get_valid_uchar(uchar data)
|
||||
uchar get_valid_uchar(float data)
|
||||
{
|
||||
return (uchar)(data <= 255 ? data : data > 0 ? 255 : 0);
|
||||
}
|
||||
@ -142,7 +143,7 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
|
||||
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx];
|
||||
|
||||
if ((x < dstCols) && (y < dstRows))
|
||||
dst[x + y * dstStep] = (float)(4.0f * sum);
|
||||
dst[x + y * dstStep] = convert_uchar_sat_rte(4.0f * sum);
|
||||
|
||||
}
|
||||
|
||||
@ -244,7 +245,7 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
|
||||
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
|
||||
|
||||
if ((x < dstCols) && (y < dstRows))
|
||||
dst[x + y * dstStep] = (float)(4.0f * sum);
|
||||
dst[x + y * dstStep] = convert_short_sat_rte(4.0f * sum);
|
||||
|
||||
}
|
||||
|
||||
@ -351,31 +352,6 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
////////////////////////// CV_8UC4 //////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
float4 covert_uchar4_to_float4(uchar4 data)
|
||||
{
|
||||
float4 f4Data = {0,0,0,0};
|
||||
|
||||
f4Data.x = (float)data.x;
|
||||
f4Data.y = (float)data.y;
|
||||
f4Data.z = (float)data.z;
|
||||
f4Data.w = (float)data.w;
|
||||
|
||||
return f4Data;
|
||||
}
|
||||
|
||||
|
||||
uchar4 convert_float4_to_uchar4(float4 data)
|
||||
{
|
||||
uchar4 u4Data;
|
||||
|
||||
u4Data.x = get_valid_uchar(data.x);
|
||||
u4Data.y = get_valid_uchar(data.y);
|
||||
u4Data.z = get_valid_uchar(data.z);
|
||||
u4Data.w = get_valid_uchar(data.w);
|
||||
|
||||
return u4Data;
|
||||
}
|
||||
|
||||
__kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
|
||||
int srcRows,int dstRows,int srcCols,int dstCols,
|
||||
int srcOffset,int dstOffset,int srcStep,int dstStep)
|
||||
@ -406,7 +382,7 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
|
||||
srcy = abs(srcy);
|
||||
srcy = min(srcRows -1 ,srcy);
|
||||
|
||||
s_srcPatch[tidy][tidx] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]);
|
||||
s_srcPatch[tidy][tidx] = convert_float4(src[srcx + srcy * srcStep]);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -476,38 +452,12 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
|
||||
|
||||
if ((x < dstCols) && (y < dstRows))
|
||||
{
|
||||
dst[x + y * dstStep] = convert_float4_to_uchar4(4.0f * sum);
|
||||
dst[x + y * dstStep] = convert_uchar4_sat_rte(4.0f * sum);
|
||||
}
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
////////////////////////// CV_16UC4 //////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
float4 covert_ushort4_to_float4(ushort4 data)
|
||||
{
|
||||
float4 f4Data = {0,0,0,0};
|
||||
|
||||
f4Data.x = (float)data.x;
|
||||
f4Data.y = (float)data.y;
|
||||
f4Data.z = (float)data.z;
|
||||
f4Data.w = (float)data.w;
|
||||
|
||||
return f4Data;
|
||||
}
|
||||
|
||||
|
||||
ushort4 convert_float4_to_ushort4(float4 data)
|
||||
{
|
||||
ushort4 u4Data;
|
||||
|
||||
u4Data.x = (float)data.x;
|
||||
u4Data.y = (float)data.y;
|
||||
u4Data.z = (float)data.z;
|
||||
u4Data.w = (float)data.w;
|
||||
|
||||
return u4Data;
|
||||
}
|
||||
|
||||
|
||||
__kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
|
||||
int srcRows,int dstRows,int srcCols,int dstCols,
|
||||
int srcOffset,int dstOffset,int srcStep,int dstStep)
|
||||
@ -535,7 +485,7 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
|
||||
srcy = abs(srcy);
|
||||
srcy = min(srcRows -1 ,srcy);
|
||||
|
||||
s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_ushort4_to_float4(src[srcx + srcy * srcStep]);
|
||||
s_srcPatch[get_local_id(1)][get_local_id(0)] = convert_float4(src[srcx + srcy * srcStep]);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -570,11 +520,11 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
|
||||
|
||||
if (eveny)
|
||||
{
|
||||
sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
|
||||
sum = sum + ( oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
|
||||
sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
|
||||
sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
|
||||
sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
|
||||
sum = sum + ( oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
|
||||
sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
|
||||
sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
|
||||
sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
|
||||
}
|
||||
|
||||
s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
|
||||
@ -610,7 +560,7 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
|
||||
|
||||
if ((x < dstCols) && (y < dstRows))
|
||||
{
|
||||
dst[x + y * dstStep] = convert_float4_to_ushort4(4.0f * sum);
|
||||
dst[x + y * dstStep] = convert_ushort4_sat_rte(4.0f * sum);
|
||||
}
|
||||
}
|
||||
|
||||
@ -681,11 +631,11 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
|
||||
|
||||
if (eveny)
|
||||
{
|
||||
sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
|
||||
sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
|
||||
sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
|
||||
sum = sum + (oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
|
||||
sum = sum + (evenFlag * co1 ) * s_srcPatch[lsizey-16][1 + ((tidx ) >> 1)];
|
||||
sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
|
||||
sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)];
|
||||
sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
|
||||
sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)];
|
||||
}
|
||||
|
||||
s_dstPatch[tidy][tidx] = sum;
|
||||
|
@ -16,6 +16,8 @@
|
||||
//
|
||||
// @Authors
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Sen Liu, swjtuls1987@126.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -50,59 +52,40 @@
|
||||
#define STEREO_MIND 0 // The minimum d range to check
|
||||
#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing
|
||||
|
||||
int SQ(int a)
|
||||
{
|
||||
return a * a;
|
||||
}
|
||||
#ifndef radius
|
||||
#define radius 64
|
||||
#endif
|
||||
|
||||
unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache,
|
||||
volatile __local unsigned int *col_ssd, int radius)
|
||||
unsigned int CalcSSD(__local unsigned int *col_ssd)
|
||||
{
|
||||
unsigned int cache = 0;
|
||||
unsigned int cache2 = 0;
|
||||
unsigned int cache = col_ssd[0];
|
||||
|
||||
for(int i = 1; i <= radius; i++)
|
||||
#pragma unroll
|
||||
for(int i = 1; i <= (radius << 1); i++)
|
||||
cache += col_ssd[i];
|
||||
|
||||
col_ssd_cache[0] = cache;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (get_local_id(0) < BLOCK_W - radius)
|
||||
cache2 = col_ssd_cache[radius];
|
||||
else
|
||||
for(int i = radius + 1; i < (2 * radius + 1); i++)
|
||||
cache2 += col_ssd[i];
|
||||
|
||||
return col_ssd[0] + cache + cache2;
|
||||
return cache;
|
||||
}
|
||||
|
||||
uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
|
||||
volatile __local unsigned int *col_ssd, int radius)
|
||||
uint2 MinSSD(__local unsigned int *col_ssd)
|
||||
{
|
||||
unsigned int ssd[N_DISPARITIES];
|
||||
const int win_size = (radius << 1);
|
||||
|
||||
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius)
|
||||
ssd[0] = CalcSSD(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * radius), radius);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
ssd[1] = CalcSSD(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * radius), radius);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
ssd[2] = CalcSSD(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * radius), radius);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
ssd[3] = CalcSSD(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * radius), radius);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
ssd[4] = CalcSSD(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * radius), radius);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
ssd[5] = CalcSSD(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * radius), radius);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
ssd[6] = CalcSSD(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * radius), radius);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
ssd[7] = CalcSSD(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * radius), radius);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
//See above: #define COL_SSD_SIZE (BLOCK_W + WIN_SIZE)
|
||||
ssd[0] = CalcSSD(col_ssd + 0 * (BLOCK_W + win_size));
|
||||
ssd[1] = CalcSSD(col_ssd + 1 * (BLOCK_W + win_size));
|
||||
ssd[2] = CalcSSD(col_ssd + 2 * (BLOCK_W + win_size));
|
||||
ssd[3] = CalcSSD(col_ssd + 3 * (BLOCK_W + win_size));
|
||||
ssd[4] = CalcSSD(col_ssd + 4 * (BLOCK_W + win_size));
|
||||
ssd[5] = CalcSSD(col_ssd + 5 * (BLOCK_W + win_size));
|
||||
ssd[6] = CalcSSD(col_ssd + 6 * (BLOCK_W + win_size));
|
||||
ssd[7] = CalcSSD(col_ssd + 7 * (BLOCK_W + win_size));
|
||||
|
||||
unsigned int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7])));
|
||||
|
||||
int bestIdx = 0;
|
||||
|
||||
for (int i = 0; i < N_DISPARITIES; i++)
|
||||
{
|
||||
if (mssd == ssd[i])
|
||||
@ -113,124 +96,66 @@ uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
|
||||
}
|
||||
|
||||
void StepDown(int idx1, int idx2, __global unsigned char* imageL,
|
||||
__global unsigned char* imageR, int d, volatile __local unsigned int *col_ssd, int radius)
|
||||
__global unsigned char* imageR, int d, __local unsigned int *col_ssd)
|
||||
{
|
||||
unsigned char leftPixel1;
|
||||
unsigned char leftPixel2;
|
||||
unsigned char rightPixel1[8];
|
||||
unsigned char rightPixel2[8];
|
||||
unsigned int diff1, diff2;
|
||||
|
||||
leftPixel1 = imageL[idx1];
|
||||
leftPixel2 = imageL[idx2];
|
||||
|
||||
idx1 = idx1 - d;
|
||||
idx2 = idx2 - d;
|
||||
|
||||
rightPixel1[7] = imageR[idx1 - 7];
|
||||
rightPixel1[0] = imageR[idx1 - 0];
|
||||
rightPixel1[1] = imageR[idx1 - 1];
|
||||
rightPixel1[2] = imageR[idx1 - 2];
|
||||
rightPixel1[3] = imageR[idx1 - 3];
|
||||
rightPixel1[4] = imageR[idx1 - 4];
|
||||
rightPixel1[5] = imageR[idx1 - 5];
|
||||
rightPixel1[6] = imageR[idx1 - 6];
|
||||
|
||||
rightPixel2[7] = imageR[idx2 - 7];
|
||||
rightPixel2[0] = imageR[idx2 - 0];
|
||||
rightPixel2[1] = imageR[idx2 - 1];
|
||||
rightPixel2[2] = imageR[idx2 - 2];
|
||||
rightPixel2[3] = imageR[idx2 - 3];
|
||||
rightPixel2[4] = imageR[idx2 - 4];
|
||||
rightPixel2[5] = imageR[idx2 - 5];
|
||||
rightPixel2[6] = imageR[idx2 - 6];
|
||||
|
||||
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius)
|
||||
diff1 = leftPixel1 - rightPixel1[0];
|
||||
diff2 = leftPixel2 - rightPixel2[0];
|
||||
col_ssd[0 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[1];
|
||||
diff2 = leftPixel2 - rightPixel2[1];
|
||||
col_ssd[1 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[2];
|
||||
diff2 = leftPixel2 - rightPixel2[2];
|
||||
col_ssd[2 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[3];
|
||||
diff2 = leftPixel2 - rightPixel2[3];
|
||||
col_ssd[3 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[4];
|
||||
diff2 = leftPixel2 - rightPixel2[4];
|
||||
col_ssd[4 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[5];
|
||||
diff2 = leftPixel2 - rightPixel2[5];
|
||||
col_ssd[5 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[6];
|
||||
diff2 = leftPixel2 - rightPixel2[6];
|
||||
col_ssd[6 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[7];
|
||||
diff2 = leftPixel2 - rightPixel2[7];
|
||||
col_ssd[7 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
|
||||
uint8 imgR1 = convert_uint8(vload8(0, imageR + (idx1 - d - 7)));
|
||||
uint8 imgR2 = convert_uint8(vload8(0, imageR + (idx2 - d - 7)));
|
||||
uint8 diff1 = (uint8)(imageL[idx1]) - imgR1;
|
||||
uint8 diff2 = (uint8)(imageL[idx2]) - imgR2;
|
||||
uint8 res = diff2 * diff2 - diff1 * diff1;
|
||||
const int win_size = (radius << 1);
|
||||
col_ssd[0 * (BLOCK_W + win_size)] += res.s7;
|
||||
col_ssd[1 * (BLOCK_W + win_size)] += res.s6;
|
||||
col_ssd[2 * (BLOCK_W + win_size)] += res.s5;
|
||||
col_ssd[3 * (BLOCK_W + win_size)] += res.s4;
|
||||
col_ssd[4 * (BLOCK_W + win_size)] += res.s3;
|
||||
col_ssd[5 * (BLOCK_W + win_size)] += res.s2;
|
||||
col_ssd[6 * (BLOCK_W + win_size)] += res.s1;
|
||||
col_ssd[7 * (BLOCK_W + win_size)] += res.s0;
|
||||
}
|
||||
|
||||
void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL,
|
||||
__global unsigned char* imageR, int d,
|
||||
volatile __local unsigned int *col_ssd, int radius)
|
||||
__local unsigned int *col_ssd)
|
||||
{
|
||||
unsigned char leftPixel1;
|
||||
int idx;
|
||||
unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
for(int i = 0; i < (2 * radius + 1); i++)
|
||||
uint8 leftPixel1;
|
||||
uint8 diffa = 0;
|
||||
int idx = y_tex * im_pitch + x_tex;
|
||||
const int win_size = (radius << 1);
|
||||
for(int i = 0; i < (win_size + 1); i++)
|
||||
{
|
||||
idx = y_tex * im_pitch + x_tex;
|
||||
leftPixel1 = imageL[idx];
|
||||
idx = idx - d;
|
||||
leftPixel1 = (uint8)(imageL[idx]);
|
||||
uint8 imgR = convert_uint8(vload8(0, imageR + (idx - d - 7)));
|
||||
uint8 res = leftPixel1 - imgR;
|
||||
diffa += res * res;
|
||||
|
||||
diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);
|
||||
diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
|
||||
diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
|
||||
diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
|
||||
diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);
|
||||
diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);
|
||||
diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);
|
||||
diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
|
||||
|
||||
y_tex += 1;
|
||||
idx += im_pitch;
|
||||
}
|
||||
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius)
|
||||
col_ssd[0 * (BLOCK_W + 2 * radius)] = diffa[0];
|
||||
col_ssd[1 * (BLOCK_W + 2 * radius)] = diffa[1];
|
||||
col_ssd[2 * (BLOCK_W + 2 * radius)] = diffa[2];
|
||||
col_ssd[3 * (BLOCK_W + 2 * radius)] = diffa[3];
|
||||
col_ssd[4 * (BLOCK_W + 2 * radius)] = diffa[4];
|
||||
col_ssd[5 * (BLOCK_W + 2 * radius)] = diffa[5];
|
||||
col_ssd[6 * (BLOCK_W + 2 * radius)] = diffa[6];
|
||||
col_ssd[7 * (BLOCK_W + 2 * radius)] = diffa[7];
|
||||
//See above: #define COL_SSD_SIZE (BLOCK_W + WIN_SIZE)
|
||||
col_ssd[0 * (BLOCK_W + win_size)] = diffa.s7;
|
||||
col_ssd[1 * (BLOCK_W + win_size)] = diffa.s6;
|
||||
col_ssd[2 * (BLOCK_W + win_size)] = diffa.s5;
|
||||
col_ssd[3 * (BLOCK_W + win_size)] = diffa.s4;
|
||||
col_ssd[4 * (BLOCK_W + win_size)] = diffa.s3;
|
||||
col_ssd[5 * (BLOCK_W + win_size)] = diffa.s2;
|
||||
col_ssd[6 * (BLOCK_W + win_size)] = diffa.s1;
|
||||
col_ssd[7 * (BLOCK_W + win_size)] = diffa.s0;
|
||||
}
|
||||
|
||||
__kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right,
|
||||
__global unsigned int *cminSSDImage, int cminSSD_step,
|
||||
__global unsigned char *disp, int disp_step,int cwidth, int cheight,
|
||||
int img_step, int maxdisp, int radius,
|
||||
int img_step, int maxdisp,
|
||||
__local unsigned int *col_ssd_cache)
|
||||
{
|
||||
|
||||
volatile __local unsigned int *col_ssd = col_ssd_cache + BLOCK_W + get_local_id(0);
|
||||
volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0;
|
||||
__local unsigned int *col_ssd = col_ssd_cache + get_local_id(0);
|
||||
__local unsigned int *col_ssd_extra = get_local_id(0) < (radius << 1) ? col_ssd + BLOCK_W : 0;
|
||||
|
||||
int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius;
|
||||
// int Y = get_group_id(1) * ROWSperTHREAD + radius;
|
||||
|
||||
#define Y (get_group_id(1) * ROWSperTHREAD + radius)
|
||||
|
||||
volatile __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
|
||||
__global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
|
||||
__global unsigned char* disparImage = disp + X + Y * disp_step;
|
||||
|
||||
int end_row = ROWSperTHREAD < (cheight - Y) ? ROWSperTHREAD:(cheight - Y);
|
||||
@ -244,14 +169,14 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
|
||||
{
|
||||
y_tex = Y - radius;
|
||||
|
||||
InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd, radius);
|
||||
InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd);
|
||||
if (col_ssd_extra > 0)
|
||||
if (x_tex + BLOCK_W < cwidth)
|
||||
InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra, radius);
|
||||
InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function
|
||||
|
||||
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
|
||||
uint2 minSSD = MinSSD(col_ssd);
|
||||
if (X < cwidth - radius && Y < cheight - radius)
|
||||
{
|
||||
if (minSSD.x < minSSDImage[0])
|
||||
@ -264,21 +189,18 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
|
||||
for(int row = 1; row < end_row; row++)
|
||||
{
|
||||
int idx1 = y_tex * img_step + x_tex;
|
||||
int idx2 = min(y_tex + (2 * radius + 1), cheight - 1) * img_step + x_tex;
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
int idx2 = min(y_tex + ((radius << 1) + 1), cheight - 1) * img_step + x_tex;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
StepDown(idx1, idx2, left, right, d, col_ssd, radius);
|
||||
StepDown(idx1, idx2, left, right, d, col_ssd);
|
||||
if (col_ssd_extra > 0)
|
||||
if (x_tex + BLOCK_W < cwidth)
|
||||
StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra, radius);
|
||||
|
||||
y_tex += 1;
|
||||
StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
|
||||
uint2 minSSD = MinSSD(col_ssd);
|
||||
if (X < cwidth - radius && row < cheight - radius - Y)
|
||||
{
|
||||
int idx = row * cminSSD_step;
|
||||
@ -288,10 +210,11 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
|
||||
minSSDImage[idx] = minSSD.x;
|
||||
}
|
||||
}
|
||||
|
||||
y_tex++;
|
||||
} // for row loop
|
||||
} // for d loop
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////// Sobel Prefiler (signal channel)//////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -185,10 +185,10 @@ __kernel void data_step_down(__global T *src, int src_rows,
|
||||
for (int d = 0; d < cndisp; ++d)
|
||||
{
|
||||
float dst_reg;
|
||||
dst_reg = src[(d * src_rows + (2*y+0)) * src_step + 2*x+0];
|
||||
dst_reg += src[(d * src_rows + (2*y+1)) * src_step + 2*x+0];
|
||||
dst_reg += src[(d * src_rows + (2*y+0)) * src_step + 2*x+1];
|
||||
dst_reg += src[(d * src_rows + (2*y+1)) * src_step + 2*x+1];
|
||||
dst_reg = src[(d * src_rows + min(2*y+0, src_rows-1)) * src_step + 2*x+0];
|
||||
dst_reg += src[(d * src_rows + min(2*y+1, src_rows-1)) * src_step + 2*x+0];
|
||||
dst_reg += src[(d * src_rows + min(2*y+0, src_rows-1)) * src_step + 2*x+1];
|
||||
dst_reg += src[(d * src_rows + min(2*y+1, src_rows-1)) * src_step + 2*x+1];
|
||||
|
||||
dst[(d * dst_rows + y) * dst_step + x] = saturate_cast(dst_reg);
|
||||
}
|
||||
|
1402
modules/ocl/src/opencl/stereocsbp.cl
Normal file
1402
modules/ocl/src/opencl/stereocsbp.cl
Normal file
File diff suppressed because it is too large
Load Diff
407
modules/ocl/src/opencl/tvl1flow.cl
Normal file
407
modules/ocl/src/opencl/tvl1flow.cl
Normal file
@ -0,0 +1,407 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Jin Ma jin@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
__kernel void centeredGradientKernel(__global const float* src, int src_col, int src_row, int src_step,
|
||||
__global float* dx, __global float* dy, int dx_step)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if((x < src_col)&&(y < src_row))
|
||||
{
|
||||
int src_x1 = (x + 1) < (src_col -1)? (x + 1) : (src_col - 1);
|
||||
int src_x2 = (x - 1) > 0 ? (x -1) : 0;
|
||||
|
||||
//if(src[y * src_step + src_x1] == src[y * src_step+ src_x2])
|
||||
//{
|
||||
// printf("y = %d\n", y);
|
||||
// printf("src_x1 = %d\n", src_x1);
|
||||
// printf("src_x2 = %d\n", src_x2);
|
||||
//}
|
||||
dx[y * dx_step+ x] = 0.5f * (src[y * src_step + src_x1] - src[y * src_step+ src_x2]);
|
||||
|
||||
int src_y1 = (y+1) < (src_row - 1) ? (y + 1) : (src_row - 1);
|
||||
int src_y2 = (y - 1) > 0 ? (y - 1) : 0;
|
||||
dy[y * dx_step+ x] = 0.5f * (src[src_y1 * src_step + x] - src[src_y2 * src_step+ x]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
float bicubicCoeff(float x_)
|
||||
{
|
||||
|
||||
float x = fabs(x_);
|
||||
if (x <= 1.0f)
|
||||
{
|
||||
return x * x * (1.5f * x - 2.5f) + 1.0f;
|
||||
}
|
||||
else if (x < 2.0f)
|
||||
{
|
||||
return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
|
||||
}
|
||||
else
|
||||
{
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_col, int I0_row,
|
||||
image2d_t tex_I1, image2d_t tex_I1x, image2d_t tex_I1y,
|
||||
__global const float* u1, int u1_step,
|
||||
__global const float* u2,
|
||||
__global float* I1w,
|
||||
__global float* I1wx, /*int I1wx_step,*/
|
||||
__global float* I1wy, /*int I1wy_step,*/
|
||||
__global float* grad, /*int grad_step,*/
|
||||
__global float* rho,
|
||||
int I1w_step,
|
||||
int u2_step,
|
||||
int u1_offset_x,
|
||||
int u1_offset_y,
|
||||
int u2_offset_x,
|
||||
int u2_offset_y)
|
||||
{
|
||||
const int x = get_global_id(0);
|
||||
const int y = get_global_id(1);
|
||||
|
||||
if(x < I0_col&&y < I0_row)
|
||||
{
|
||||
//const float u1Val = u1(y, x);
|
||||
const float u1Val = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
|
||||
//const float u2Val = u2(y, x);
|
||||
const float u2Val = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
|
||||
|
||||
const float wx = x + u1Val;
|
||||
const float wy = y + u2Val;
|
||||
|
||||
const int xmin = ceil(wx - 2.0f);
|
||||
const int xmax = floor(wx + 2.0f);
|
||||
|
||||
const int ymin = ceil(wy - 2.0f);
|
||||
const int ymax = floor(wy + 2.0f);
|
||||
|
||||
float sum = 0.0f;
|
||||
float sumx = 0.0f;
|
||||
float sumy = 0.0f;
|
||||
float wsum = 0.0f;
|
||||
sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
|
||||
|
||||
for (int cy = ymin; cy <= ymax; ++cy)
|
||||
{
|
||||
for (int cx = xmin; cx <= xmax; ++cx)
|
||||
{
|
||||
const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
|
||||
|
||||
//sum += w * tex2D(tex_I1 , cx, cy);
|
||||
int2 cood = (int2)(cx, cy);
|
||||
sum += w * read_imagef(tex_I1, sampleri, cood).x;
|
||||
//sumx += w * tex2D(tex_I1x, cx, cy);
|
||||
sumx += w * read_imagef(tex_I1x, sampleri, cood).x;
|
||||
//sumy += w * tex2D(tex_I1y, cx, cy);
|
||||
sumy += w * read_imagef(tex_I1y, sampleri, cood).x;
|
||||
|
||||
wsum += w;
|
||||
}
|
||||
}
|
||||
|
||||
const float coeff = 1.0f / wsum;
|
||||
|
||||
const float I1wVal = sum * coeff;
|
||||
const float I1wxVal = sumx * coeff;
|
||||
const float I1wyVal = sumy * coeff;
|
||||
|
||||
I1w[y * I1w_step + x] = I1wVal;
|
||||
I1wx[y * I1w_step + x] = I1wxVal;
|
||||
I1wy[y * I1w_step + x] = I1wyVal;
|
||||
|
||||
const float Ix2 = I1wxVal * I1wxVal;
|
||||
const float Iy2 = I1wyVal * I1wyVal;
|
||||
|
||||
// store the |Grad(I1)|^2
|
||||
grad[y * I1w_step + x] = Ix2 + Iy2;
|
||||
|
||||
// compute the constant part of the rho function
|
||||
const float I0Val = I0[y * I0_step + x];
|
||||
rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
float readImage(__global const float *image, const int x, const int y, const int rows, const int cols, const int elemCntPerRow)
|
||||
{
|
||||
int i0 = clamp(x, 0, cols - 1);
|
||||
int j0 = clamp(y, 0, rows - 1);
|
||||
int i1 = clamp(x + 1, 0, cols - 1);
|
||||
int j1 = clamp(y + 1, 0, rows - 1);
|
||||
|
||||
return image[j0 * elemCntPerRow + i0];
|
||||
}
|
||||
|
||||
__kernel void warpBackwardKernelNoImage2d(__global const float* I0, int I0_step, int I0_col, int I0_row,
|
||||
__global const float* tex_I1, __global const float* tex_I1x, __global const float* tex_I1y,
|
||||
__global const float* u1, int u1_step,
|
||||
__global const float* u2,
|
||||
__global float* I1w,
|
||||
__global float* I1wx, /*int I1wx_step,*/
|
||||
__global float* I1wy, /*int I1wy_step,*/
|
||||
__global float* grad, /*int grad_step,*/
|
||||
__global float* rho,
|
||||
int I1w_step,
|
||||
int u2_step,
|
||||
int I1_step,
|
||||
int I1x_step)
|
||||
{
|
||||
const int x = get_global_id(0);
|
||||
const int y = get_global_id(1);
|
||||
|
||||
if(x < I0_col&&y < I0_row)
|
||||
{
|
||||
//const float u1Val = u1(y, x);
|
||||
const float u1Val = u1[y * u1_step + x];
|
||||
//const float u2Val = u2(y, x);
|
||||
const float u2Val = u2[y * u2_step + x];
|
||||
|
||||
const float wx = x + u1Val;
|
||||
const float wy = y + u2Val;
|
||||
|
||||
const int xmin = ceil(wx - 2.0f);
|
||||
const int xmax = floor(wx + 2.0f);
|
||||
|
||||
const int ymin = ceil(wy - 2.0f);
|
||||
const int ymax = floor(wy + 2.0f);
|
||||
|
||||
float sum = 0.0f;
|
||||
float sumx = 0.0f;
|
||||
float sumy = 0.0f;
|
||||
float wsum = 0.0f;
|
||||
|
||||
for (int cy = ymin; cy <= ymax; ++cy)
|
||||
{
|
||||
for (int cx = xmin; cx <= xmax; ++cx)
|
||||
{
|
||||
const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
|
||||
|
||||
int2 cood = (int2)(cx, cy);
|
||||
sum += w * readImage(tex_I1, cood.x, cood.y, I0_col, I0_row, I1_step);
|
||||
sumx += w * readImage(tex_I1x, cood.x, cood.y, I0_col, I0_row, I1x_step);
|
||||
sumy += w * readImage(tex_I1y, cood.x, cood.y, I0_col, I0_row, I1x_step);
|
||||
wsum += w;
|
||||
}
|
||||
}
|
||||
|
||||
const float coeff = 1.0f / wsum;
|
||||
|
||||
const float I1wVal = sum * coeff;
|
||||
const float I1wxVal = sumx * coeff;
|
||||
const float I1wyVal = sumy * coeff;
|
||||
|
||||
I1w[y * I1w_step + x] = I1wVal;
|
||||
I1wx[y * I1w_step + x] = I1wxVal;
|
||||
I1wy[y * I1w_step + x] = I1wyVal;
|
||||
|
||||
const float Ix2 = I1wxVal * I1wxVal;
|
||||
const float Iy2 = I1wyVal * I1wyVal;
|
||||
|
||||
// store the |Grad(I1)|^2
|
||||
grad[y * I1w_step + x] = Ix2 + Iy2;
|
||||
|
||||
// compute the constant part of the rho function
|
||||
const float I0Val = I0[y * I0_step + x];
|
||||
rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
__kernel void estimateDualVariablesKernel(__global const float* u1, int u1_col, int u1_row, int u1_step,
|
||||
__global const float* u2,
|
||||
__global float* p11, int p11_step,
|
||||
__global float* p12,
|
||||
__global float* p21,
|
||||
__global float* p22,
|
||||
const float taut,
|
||||
int u2_step,
|
||||
int u1_offset_x,
|
||||
int u1_offset_y,
|
||||
int u2_offset_x,
|
||||
int u2_offset_y)
|
||||
{
|
||||
|
||||
//const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
//const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
const int x = get_global_id(0);
|
||||
const int y = get_global_id(1);
|
||||
|
||||
if(x < u1_col && y < u1_row)
|
||||
{
|
||||
int src_x1 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);
|
||||
const float u1x = u1[(y + u1_offset_y) * u1_step + src_x1 + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
|
||||
|
||||
int src_y1 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);
|
||||
const float u1y = u1[(src_y1 + u1_offset_y) * u1_step + x + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
|
||||
|
||||
int src_x2 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);
|
||||
const float u2x = u2[(y + u2_offset_y) * u2_step + src_x2 + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
|
||||
|
||||
int src_y2 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);
|
||||
const float u2y = u2[(src_y2 + u2_offset_y) * u2_step + x + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
|
||||
|
||||
const float g1 = hypot(u1x, u1y);
|
||||
const float g2 = hypot(u2x, u2y);
|
||||
|
||||
const float ng1 = 1.0f + taut * g1;
|
||||
const float ng2 = 1.0f + taut * g2;
|
||||
|
||||
p11[y * p11_step + x] = (p11[y * p11_step + x] + taut * u1x) / ng1;
|
||||
p12[y * p11_step + x] = (p12[y * p11_step + x] + taut * u1y) / ng1;
|
||||
p21[y * p11_step + x] = (p21[y * p11_step + x] + taut * u2x) / ng2;
|
||||
p22[y * p11_step + x] = (p22[y * p11_step + x] + taut * u2y) / ng2;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
float divergence(__global const float* v1, __global const float* v2, int y, int x, int v1_step, int v2_step)
|
||||
{
|
||||
|
||||
if (x > 0 && y > 0)
|
||||
{
|
||||
const float v1x = v1[y * v1_step + x] - v1[y * v1_step + x - 1];
|
||||
const float v2y = v2[y * v2_step + x] - v2[(y - 1) * v2_step + x];
|
||||
return v1x + v2y;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (y > 0)
|
||||
return v1[y * v1_step + 0] + v2[y * v2_step + 0] - v2[(y - 1) * v2_step + 0];
|
||||
else
|
||||
{
|
||||
if (x > 0)
|
||||
return v1[0 * v1_step + x] - v1[0 * v1_step + x - 1] + v2[0 * v2_step + x];
|
||||
else
|
||||
return v1[0 * v1_step + 0] + v2[0 * v2_step + 0];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__kernel void estimateUKernel(__global const float* I1wx, int I1wx_col, int I1wx_row, int I1wx_step,
|
||||
__global const float* I1wy, /*int I1wy_step,*/
|
||||
__global const float* grad, /*int grad_step,*/
|
||||
__global const float* rho_c, /*int rho_c_step,*/
|
||||
__global const float* p11, /*int p11_step,*/
|
||||
__global const float* p12, /*int p12_step,*/
|
||||
__global const float* p21, /*int p21_step,*/
|
||||
__global const float* p22, /*int p22_step,*/
|
||||
__global float* u1, int u1_step,
|
||||
__global float* u2,
|
||||
__global float* error, const float l_t, const float theta, int u2_step,
|
||||
int u1_offset_x,
|
||||
int u1_offset_y,
|
||||
int u2_offset_x,
|
||||
int u2_offset_y)
|
||||
{
|
||||
|
||||
//const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
//const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
|
||||
if(x < I1wx_col && y < I1wx_row)
|
||||
{
|
||||
const float I1wxVal = I1wx[y * I1wx_step + x];
|
||||
const float I1wyVal = I1wy[y * I1wx_step + x];
|
||||
const float gradVal = grad[y * I1wx_step + x];
|
||||
const float u1OldVal = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
|
||||
const float u2OldVal = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
|
||||
|
||||
const float rho = rho_c[y * I1wx_step + x] + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);
|
||||
|
||||
// estimate the values of the variable (v1, v2) (thresholding operator TH)
|
||||
|
||||
float d1 = 0.0f;
|
||||
float d2 = 0.0f;
|
||||
|
||||
if (rho < -l_t * gradVal)
|
||||
{
|
||||
d1 = l_t * I1wxVal;
|
||||
d2 = l_t * I1wyVal;
|
||||
}
|
||||
else if (rho > l_t * gradVal)
|
||||
{
|
||||
d1 = -l_t * I1wxVal;
|
||||
d2 = -l_t * I1wyVal;
|
||||
}
|
||||
else if (gradVal > 1.192092896e-07f)
|
||||
{
|
||||
const float fi = -rho / gradVal;
|
||||
d1 = fi * I1wxVal;
|
||||
d2 = fi * I1wyVal;
|
||||
}
|
||||
|
||||
const float v1 = u1OldVal + d1;
|
||||
const float v2 = u2OldVal + d2;
|
||||
|
||||
// compute the divergence of the dual variable (p1, p2)
|
||||
|
||||
const float div_p1 = divergence(p11, p12, y, x, I1wx_step, I1wx_step);
|
||||
const float div_p2 = divergence(p21, p22, y, x, I1wx_step, I1wx_step);
|
||||
|
||||
// estimate the values of the optical flow (u1, u2)
|
||||
|
||||
const float u1NewVal = v1 + theta * div_p1;
|
||||
const float u2NewVal = v2 + theta * div_p2;
|
||||
|
||||
u1[(y + u1_offset_y) * u1_step + x + u1_offset_x] = u1NewVal;
|
||||
u2[(y + u2_offset_y) * u2_step + x + u2_offset_x] = u2NewVal;
|
||||
|
||||
const float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
|
||||
const float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
|
||||
error[y * I1wx_step + x] = n1 + n2;
|
||||
}
|
||||
|
||||
}
|
756
modules/ocl/src/stereo_csbp.cpp
Normal file
756
modules/ocl/src/stereo_csbp.cpp
Normal file
@ -0,0 +1,756 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Jin Ma, jin@multicorewareinc.com
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
using namespace std;
|
||||
|
||||
#if !defined (HAVE_OPENCL)
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace ocl
|
||||
{
|
||||
|
||||
void cv::ocl::StereoConstantSpaceBP::estimateRecommendedParams(int, int, int &, int &, int &, int &)
|
||||
{
|
||||
throw_nogpu();
|
||||
}
|
||||
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, int)
|
||||
{
|
||||
throw_nogpu();
|
||||
}
|
||||
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, float, float,
|
||||
float, float, int, int)
|
||||
{
|
||||
throw_nogpu();
|
||||
}
|
||||
|
||||
void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &, const oclMat &, oclMat &)
|
||||
{
|
||||
throw_nogpu();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#else /* !defined (HAVE_OPENCL) */
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace ocl
|
||||
{
|
||||
|
||||
///////////////////////////OpenCL kernel Strings///////////////////////////
|
||||
extern const char *stereocsbp;
|
||||
}
|
||||
|
||||
}
|
||||
namespace cv
|
||||
{
|
||||
namespace ocl
|
||||
{
|
||||
namespace stereoCSBP
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////common////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
static inline int divUp(int total, int grain)
|
||||
{
|
||||
return (total + grain - 1) / grain;
|
||||
}
|
||||
static String get_kernel_name(String kernel_name, int data_type)
|
||||
{
|
||||
return kernel_name + (data_type == CV_16S ? "0" : "1");
|
||||
}
|
||||
using cv::ocl::StereoConstantSpaceBP;
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////init_data_cost//////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
static void init_data_cost_caller(const oclMat &left, const oclMat &right, oclMat &temp,
|
||||
StereoConstantSpaceBP &rthis,
|
||||
int msg_step, int h, int w, int level)
|
||||
{
|
||||
Context *clCxt = left.clCxt;
|
||||
int data_type = rthis.msg_type;
|
||||
int channels = left.oclchannels();
|
||||
|
||||
String kernelName = get_kernel_name("init_data_cost_", data_type);
|
||||
|
||||
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
|
||||
|
||||
//size_t blockSize = 256;
|
||||
size_t localThreads[] = {32, 8 ,1};
|
||||
size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
|
||||
divUp(h, localThreads[1]) *localThreads[1],
|
||||
1
|
||||
};
|
||||
|
||||
int cdisp_step1 = msg_step * h;
|
||||
openCLVerifyKernel(clCxt, kernel, localThreads);
|
||||
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&temp.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&left.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&right.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&level));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&channels));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&msg_step));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&rthis.data_weight));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_float), (void *)&rthis.max_data_term));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&cdisp_step1));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&rthis.min_disp_th));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&left.step));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&rthis.ndisp));
|
||||
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
|
||||
globalThreads, localThreads, 0, NULL, NULL));
|
||||
|
||||
clFinish(*(cl_command_queue*)getoclCommandQueue());
|
||||
openCLSafeCall(clReleaseKernel(kernel));
|
||||
}
|
||||
|
||||
static void init_data_cost_reduce_caller(const oclMat &left, const oclMat &right, oclMat &temp,
|
||||
StereoConstantSpaceBP &rthis,
|
||||
int msg_step, int h, int w, int level)
|
||||
{
|
||||
|
||||
Context *clCxt = left.clCxt;
|
||||
int data_type = rthis.msg_type;
|
||||
int channels = left.oclchannels();
|
||||
int win_size = (int)std::pow(2.f, level);
|
||||
|
||||
String kernelName = get_kernel_name("init_data_cost_reduce_", data_type);
|
||||
|
||||
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
|
||||
|
||||
const int threadsNum = 256;
|
||||
//size_t blockSize = threadsNum;
|
||||
size_t localThreads[3] = {win_size, 1, threadsNum / win_size};
|
||||
size_t globalThreads[3] = {w *localThreads[0],
|
||||
h * divUp(rthis.ndisp, localThreads[2]) *localThreads[1], 1 * localThreads[2]
|
||||
};
|
||||
|
||||
int local_mem_size = threadsNum * sizeof(float);
|
||||
int cdisp_step1 = msg_step * h;
|
||||
|
||||
openCLVerifyKernel(clCxt, kernel, localThreads);
|
||||
|
||||
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&temp.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&left.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&right.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 3, local_mem_size, (void *)NULL));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&level));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&left.rows));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&h));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&win_size));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&channels));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&rthis.ndisp));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&left.step));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_float), (void *)&rthis.data_weight));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_float), (void *)&rthis.max_data_term));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&rthis.min_disp_th));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&cdisp_step1));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&msg_step));
|
||||
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL,
|
||||
globalThreads, localThreads, 0, NULL, NULL));
|
||||
clFinish(*(cl_command_queue*)getoclCommandQueue());
|
||||
openCLSafeCall(clReleaseKernel(kernel));
|
||||
}
|
||||
|
||||
// Launches the "get_first_k_initial_local_[01]" kernel: seeds the first
// nr_plane selected disparities per pixel from local minima of the data cost
// in `temp`, writing the chosen costs and disparities into the two output
// buffers.
//
//   h, w     - dimensions of the current pyramid level
//   nr_plane - number of disparity planes kept per pixel
//   msg_step - row stride (in elements) of the message/cost buffers
static void get_first_initial_local_caller(uchar *data_cost_selected, uchar *disp_selected_pyr,
        oclMat &temp, StereoConstantSpaceBP &rthis,
        int h, int w, int nr_plane, int msg_step)
{
    Context *clCxt = temp.clCxt;
    int data_type = rthis.msg_type;

    // "0" suffix selects the CV_16S kernel variant, "1" the CV_32F one.
    String kernelName = get_kernel_name("get_first_k_initial_local_", data_type);

    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    // One work-item per pixel, 32x8 work-groups, rounded up to cover w x h.
    size_t localThreads[] = {32, 8, 1};
    size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
                              divUp(h, localThreads[1]) *localThreads[1],
                              1
                             };

    // Stride between disparity planes inside the buffers.
    int disp_step = msg_step * h;
    openCLVerifyKernel(clCxt, kernel, localThreads);
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_cost_selected));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&temp.data));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp));
    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    // Block until the kernel finishes, then release it.
    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
|
||||
// Launches the "get_first_k_initial_global_[01]" kernel: seeds the first
// nr_plane selected disparities per pixel from the globally smallest data
// costs in `temp` (alternative to the local-minima seeding above).
//
//   h, w     - dimensions of the current pyramid level
//   nr_plane - number of disparity planes kept per pixel
//   msg_step - row stride (in elements) of the message/cost buffers
static void get_first_initial_global_caller(uchar *data_cost_selected, uchar *disp_selected_pyr,
        oclMat &temp, StereoConstantSpaceBP &rthis,
        int h, int w, int nr_plane, int msg_step)
{
    Context *clCxt = temp.clCxt;
    int data_type = rthis.msg_type;

    // "0" suffix selects the CV_16S kernel variant, "1" the CV_32F one.
    String kernelName = get_kernel_name("get_first_k_initial_global_", data_type);

    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    // One work-item per pixel, 32x8 work-groups, rounded up to cover w x h.
    size_t localThreads[] = {32, 8, 1};
    size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
                              divUp(h, localThreads[1]) *localThreads[1],
                              1
                             };

    // Stride between disparity planes inside the buffers.
    int disp_step = msg_step * h;
    openCLVerifyKernel(clCxt, kernel, localThreads);
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_cost_selected));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&temp.data));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp));
    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    // Block until the kernel finishes, then release it.
    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
|
||||
|
||||
static void init_data_cost(const oclMat &left, const oclMat &right, oclMat &temp, StereoConstantSpaceBP &rthis,
|
||||
uchar *disp_selected_pyr, uchar *data_cost_selected,
|
||||
size_t msg_step, int h, int w, int level, int nr_plane)
|
||||
{
|
||||
|
||||
if(level <= 1)
|
||||
init_data_cost_caller(left, right, temp, rthis, msg_step, h, w, level);
|
||||
else
|
||||
init_data_cost_reduce_caller(left, right, temp, rthis, msg_step, h, w, level);
|
||||
|
||||
if(rthis.use_local_init_data_cost == true)
|
||||
{
|
||||
get_first_initial_local_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, nr_plane, msg_step);
|
||||
}
|
||||
else
|
||||
{
|
||||
get_first_initial_global_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w,
|
||||
nr_plane, msg_step);
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////compute_data_cost//////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Launches the "compute_data_cost_[01]" kernel: computes the data cost at the
// current level restricted to the disparities selected at the coarser level
// (used for levels 0 and 1, where no window reduction is needed).
//
//   msg_step1/msg_step2 - strides of the current / coarser level buffers
//   h, w                - dimensions of the current level
//   h2                  - height of the coarser level
static void compute_data_cost_caller(uchar *disp_selected_pyr, uchar *data_cost,
                                     StereoConstantSpaceBP &rthis, int msg_step1,
                                     int msg_step2, const oclMat &left, const oclMat &right, int h,
                                     int w, int h2, int level, int nr_plane)
{
    Context *clCxt = left.clCxt;
    int channels = left.oclchannels();
    int data_type = rthis.msg_type;

    // "0" suffix selects the CV_16S kernel variant, "1" the CV_32F one.
    String kernelName = get_kernel_name("compute_data_cost_", data_type);

    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    size_t localThreads[]  = {32, 8, 1};
    size_t globalThreads[] = {divUp(w, localThreads[0]) * localThreads[0],
                              divUp(h, localThreads[1]) * localThreads[1],
                              1
                             };

    int disp_step1 = msg_step1 * h;
    int disp_step2 = msg_step2 * h2;
    // BUGFIX: left.step is size_t, but the kernel argument is a 32-bit int.
    // Passing &left.step with sizeof(cl_int) reads only part of the value and
    // is endian-dependent; copy it into a properly sized variable instead.
    cl_int left_step = static_cast<cl_int>(left.step);
    openCLVerifyKernel(clCxt, kernel, localThreads);
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&data_cost));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&left.data));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&right.data));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&w));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&level));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&channels));
    openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&msg_step1));
    openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&msg_step2));
    openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&disp_step1));
    openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step2));
    openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_float), (void *)&rthis.data_weight));
    openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.max_data_term));
    openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&left_step));
    openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&rthis.min_disp_th));
    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
|
||||
// Launches the "compute_data_cost_reduce_[01]" kernel: computes the data cost
// at a coarse level (level > 1) restricted to the disparities selected at the
// coarser level, reducing over a 2^level-wide window with local memory.
//
//   msg_step1/msg_step2 - strides of the current / coarser level buffers
//   h, w                - dimensions of the current level
//   h2                  - height of the coarser level
static void compute_data_cost_reduce_caller(uchar *disp_selected_pyr, uchar *data_cost,
        StereoConstantSpaceBP &rthis, int msg_step1,
        int msg_step2, const oclMat &left, const oclMat &right, int h,
        int w, int h2, int level, int nr_plane)
{
    Context *clCxt = left.clCxt;
    int data_type = rthis.msg_type;
    int channels = left.oclchannels();
    int win_size = (int)std::pow(2.f, level);

    String kernelName = get_kernel_name("compute_data_cost_reduce_", data_type);

    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    const size_t threadsNum = 256;
    size_t localThreads[3] = {win_size, 1, threadsNum / win_size};
    // NOTE(review): globalThreads[1] scales by localThreads[1] (== 1) while
    // the divUp uses localThreads[2]; this mirrors the original launch shape.
    size_t globalThreads[3] = {w *localThreads[0],
                               h * divUp(nr_plane, localThreads[2]) *localThreads[1], 1 * localThreads[2]
                              };

    int disp_step1 = msg_step1 * h;
    int disp_step2 = msg_step2 * h2;
    size_t local_mem_size = threadsNum * sizeof(float);
    // BUGFIX: left.step is size_t, but the kernel argument is a 32-bit int.
    // Passing &left.step with sizeof(cl_int) reads only part of the value and
    // is endian-dependent; copy it into a properly sized variable instead.
    cl_int left_step = static_cast<cl_int>(left.step);
    openCLVerifyKernel(clCxt, kernel, localThreads);
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&data_cost));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&left.data));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&right.data));
    openCLSafeCall(clSetKernelArg(kernel, 4, local_mem_size, (void *)NULL));   // scratch local memory
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&level));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.rows));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&left.cols));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&channels));
    openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&win_size));
    openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&msg_step1));
    openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&msg_step2));
    openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&disp_step1));
    openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&disp_step2));
    openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_float), (void *)&rthis.data_weight));
    openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_float), (void *)&rthis.max_data_term));
    openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int), (void *)&left_step));
    openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int), (void *)&rthis.min_disp_th));
    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
|
||||
// Computes the data cost for the current level restricted to the coarser
// level's selected disparities. Fine levels (0/1) use the direct kernel;
// coarser ones use the local-memory reduction variant.
static void compute_data_cost(uchar *disp_selected_pyr, uchar *data_cost, StereoConstantSpaceBP &rthis,
                              int msg_step1, int msg_step2, const oclMat &left, const oclMat &right, int h, int w,
                              int h2, int level, int nr_plane)
{
    // Both callers share a signature, so dispatch through a function pointer.
    void (*caller)(uchar *, uchar *, StereoConstantSpaceBP &, int, int,
                   const oclMat &, const oclMat &, int, int, int, int, int) =
        (level <= 1) ? compute_data_cost_caller : compute_data_cost_reduce_caller;

    caller(disp_selected_pyr, data_cost, rthis, msg_step1, msg_step2,
           left, right, h, w, h2, level, nr_plane);
}
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////////////init message//////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
static void init_message(uchar *u_new, uchar *d_new, uchar *l_new, uchar *r_new,
|
||||
uchar *u_cur, uchar *d_cur, uchar *l_cur, uchar *r_cur,
|
||||
uchar *disp_selected_pyr_new, uchar *disp_selected_pyr_cur,
|
||||
uchar *data_cost_selected, uchar *data_cost, oclMat &temp, StereoConstantSpaceBP rthis,
|
||||
size_t msg_step1, size_t msg_step2, int h, int w, int nr_plane,
|
||||
int h2, int w2, int nr_plane2)
|
||||
{
|
||||
Context *clCxt = temp.clCxt;
|
||||
int data_type = rthis.msg_type;
|
||||
|
||||
String kernelName = get_kernel_name("init_message_", data_type);
|
||||
|
||||
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
|
||||
|
||||
//size_t blockSize = 256;
|
||||
size_t localThreads[] = {32, 8, 1};
|
||||
size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
|
||||
divUp(h, localThreads[1]) *localThreads[1],
|
||||
1
|
||||
};
|
||||
|
||||
int disp_step1 = msg_step1 * h;
|
||||
int disp_step2 = msg_step2 * h2;
|
||||
openCLVerifyKernel(clCxt, kernel, localThreads);
|
||||
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u_new));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_new));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l_new));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r_new));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&u_cur));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&d_cur));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&l_cur));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&r_cur));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *)&temp.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *)&disp_selected_pyr_new));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *)&disp_selected_pyr_cur));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *)&data_cost_selected));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *)&data_cost));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&h));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&w));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&nr_plane));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&h2));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_int), (void *)&w2));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int), (void *)&nr_plane2));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int), (void *)&disp_step1));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 20, sizeof(cl_int), (void *)&disp_step2));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 21, sizeof(cl_int), (void *)&msg_step1));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 22, sizeof(cl_int), (void *)&msg_step2));
|
||||
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
|
||||
globalThreads, localThreads, 0, NULL, NULL));
|
||||
|
||||
clFinish(*(cl_command_queue*)getoclCommandQueue());
|
||||
openCLSafeCall(clReleaseKernel(kernel));
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////calc_all_iterations////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////
|
||||
static void calc_all_iterations_caller(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
|
||||
uchar *disp_selected_pyr, oclMat &temp, StereoConstantSpaceBP rthis,
|
||||
int msg_step, int h, int w, int nr_plane, int i)
|
||||
{
|
||||
Context *clCxt = temp.clCxt;
|
||||
int data_type = rthis.msg_type;
|
||||
|
||||
String kernelName = get_kernel_name("compute_message_", data_type);
|
||||
|
||||
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
|
||||
size_t localThreads[] = {32, 8, 1};
|
||||
size_t globalThreads[] = {divUp(w, (localThreads[0]) << 1) *localThreads[0],
|
||||
divUp(h, localThreads[1]) *localThreads[1],
|
||||
1
|
||||
};
|
||||
|
||||
int disp_step = msg_step * h;
|
||||
openCLVerifyKernel(clCxt, kernel, localThreads);
|
||||
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&data_cost_selected));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&disp_selected_pyr));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&temp.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&h));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&w));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&nr_plane));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&i));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_float), (void *)&rthis.max_disc_term));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&msg_step));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.disc_single_jump));
|
||||
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
|
||||
globalThreads, localThreads, 0, NULL, NULL));
|
||||
|
||||
clFinish(*(cl_command_queue*)getoclCommandQueue());
|
||||
openCLSafeCall(clReleaseKernel(kernel));
|
||||
}
|
||||
static void calc_all_iterations(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
|
||||
uchar *disp_selected_pyr, oclMat &temp, StereoConstantSpaceBP rthis,
|
||||
int msg_step, int h, int w, int nr_plane)
|
||||
{
|
||||
for(int t = 0; t < rthis.iters; t++)
|
||||
calc_all_iterations_caller(u, d, l, r, data_cost_selected, disp_selected_pyr, temp, rthis,
|
||||
msg_step, h, w, nr_plane, t & 1);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////compute_disp////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Launches the "compute_disp_[01]" kernel: selects the final per-pixel
// disparity from the converged messages and selected planes, writing the
// result into `disp`.
//
//   msg_step - row stride (in elements) of the message buffers
//   nr_plane - number of disparity planes per pixel at the finest level
static void compute_disp(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
                         uchar *disp_selected_pyr, StereoConstantSpaceBP &rthis, size_t msg_step,
                         oclMat &disp, int nr_plane)
{
    Context *clCxt = disp.clCxt;
    int data_type = rthis.msg_type;

    // "0" suffix selects the CV_16S kernel variant, "1" the CV_32F one.
    String kernelName = get_kernel_name("compute_disp_", data_type);

    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    size_t localThreads[]  = {32, 8, 1};
    size_t globalThreads[] = {divUp(disp.cols, localThreads[0]) * localThreads[0],
                              divUp(disp.rows, localThreads[1]) * localThreads[1],
                              1
                             };

    int step_size = disp.step / disp.elemSize();
    int disp_step = disp.rows * msg_step;
    // BUGFIX: msg_step is size_t, but the kernel argument is a 32-bit int.
    // Passing &msg_step with sizeof(cl_int) reads only part of the value and
    // is endian-dependent; copy it into a properly sized variable instead.
    cl_int msg_step_arg = static_cast<cl_int>(msg_step);
    openCLVerifyKernel(clCxt, kernel, localThreads);
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&data_cost_selected));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&disp.data));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&step_size));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&disp.cols));
    openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&disp.rows));
    openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&msg_step_arg));
    openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step));
    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
|
||||
}
|
||||
}
|
||||
}
|
||||
namespace
{
    // Default tuning parameters used by the short-form constructor below.
    const float DEFAULT_MAX_DATA_TERM = 30.0f;       // truncation of the data cost term
    const float DEFAULT_DATA_WEIGHT = 1.0f;          // weight applied to the data term
    const float DEFAULT_MAX_DISC_TERM = 160.0f;      // truncation of the discontinuity term
    const float DEFAULT_DISC_SINGLE_JUMP = 10.0f;    // per-unit discontinuity penalty
}
|
||||
|
||||
void cv::ocl::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane)
|
||||
{
|
||||
ndisp = (int) ((float) width / 3.14f);
|
||||
if ((ndisp & 1) != 0)
|
||||
ndisp++;
|
||||
|
||||
int mm = ::max(width, height);
|
||||
iters = mm / 100 + ((mm > 1200) ? - 4 : 4);
|
||||
|
||||
levels = (int)::log(static_cast<double>(mm)) * 2 / 3;
|
||||
if (levels == 0) levels++;
|
||||
|
||||
nr_plane = (int) ((float) ndisp / std::pow(2.0, levels + 1));
|
||||
}
|
||||
|
||||
// Constructs the algorithm with explicit pyramid/iteration settings and the
// default cost-term tuning parameters defined above.
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
        int msg_type_)

    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
      max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
      max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP), min_disp_th(0),
      msg_type(msg_type_), use_local_init_data_cost(true)
{
    // Messages are stored either as 16-bit shorts or 32-bit floats.
    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
}
|
||||
|
||||
|
||||
// Constructs the algorithm with every tuning parameter specified explicitly.
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
        float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_,
        int min_disp_th_, int msg_type_)
    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
      max_data_term(max_data_term_), data_weight(data_weight_),
      max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_), min_disp_th(min_disp_th_),
      msg_type(msg_type_), use_local_init_data_cost(true)
{
    // Messages are stored either as 16-bit shorts or 32-bit floats.
    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
}
|
||||
|
||||
// Full constant-space belief-propagation pipeline, templated on the message
// element type T (short for CV_16S, float for CV_32F). Builds the pyramid
// geometry, allocates and zeroes all working buffers, runs the coarse-to-fine
// message-passing loop, and writes the final disparity map into `disp`.
template<class T>
static void csbp_operator(StereoConstantSpaceBP &rthis, oclMat u[2], oclMat d[2], oclMat l[2], oclMat r[2],
                          oclMat disp_selected_pyr[2], oclMat &data_cost, oclMat &data_cost_selected,
                          oclMat &temp, oclMat &out, const oclMat &left, const oclMat &right, oclMat &disp)
{
    CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
                 && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());

    CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3));

    const Scalar zero = Scalar::all(0);

    ////////////////////////////////////Init///////////////////////////////////////////////////
    int rows = left.rows;
    int cols = left.cols;

    // Cap the pyramid depth so the coarsest level still has at least one
    // disparity (ndisp halves per level).
    rthis.levels = min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));
    int levels = rthis.levels;

    // Per-level geometry: widths, heights, plane counts and row strides,
    // packed into a single scratch buffer.
    AutoBuffer<int> buf(levels * 4);

    int *cols_pyr = buf;
    int *rows_pyr = cols_pyr + levels;
    int *nr_plane_pyr = rows_pyr + levels;
    int *step_pyr = nr_plane_pyr + levels;

    cols_pyr[0] = cols;
    rows_pyr[0] = rows;
    nr_plane_pyr[0] = rthis.nr_plane;

    // Row stride aligned to n bytes, expressed in elements of T.
    const int n = 64;
    step_pyr[0] = alignSize(cols * sizeof(T), n) / sizeof(T);
    for (int i = 1; i < levels; i++)
    {
        // Each coarser level halves the resolution and doubles the plane count.
        cols_pyr[i] = cols_pyr[i - 1] / 2;
        rows_pyr[i] = rows_pyr[i - 1]/ 2;

        nr_plane_pyr[i] = nr_plane_pyr[i - 1] * 2;

        step_pyr[i] = alignSize(cols_pyr[i] * sizeof(T), n) / sizeof(T);
    }

    // Message buffers hold nr_plane planes stacked vertically; the data-cost
    // buffer holds twice that.
    Size msg_size(step_pyr[0], rows * nr_plane_pyr[0]);
    Size data_cost_size(step_pyr[0], rows * nr_plane_pyr[0] * 2);

    u[0].create(msg_size, DataType<T>::type);
    d[0].create(msg_size, DataType<T>::type);
    l[0].create(msg_size, DataType<T>::type);
    r[0].create(msg_size, DataType<T>::type);

    u[1].create(msg_size, DataType<T>::type);
    d[1].create(msg_size, DataType<T>::type);
    l[1].create(msg_size, DataType<T>::type);
    r[1].create(msg_size, DataType<T>::type);

    disp_selected_pyr[0].create(msg_size, DataType<T>::type);
    disp_selected_pyr[1].create(msg_size, DataType<T>::type);

    data_cost.create(data_cost_size, DataType<T>::type);
    data_cost_selected.create(msg_size, DataType<T>::type);

    // `temp` must also be able to hold the full-ndisp cost of the coarsest
    // level; grow it if that exceeds the data-cost size.
    Size temp_size = data_cost_size;
    if (data_cost_size.width * data_cost_size.height < step_pyr[0] * rows_pyr[levels - 1] * rthis.ndisp)
        temp_size = Size(step_pyr[0], rows_pyr[levels - 1] * rthis.ndisp);

    temp.create(temp_size, DataType<T>::type);
    temp = zero;

    ///////////////////////////////// Compute////////////////////////////////////////////////

    //csbp::load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight,
    //  rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);

    // Zero every working buffer before the first iteration.
    l[0] = zero;
    d[0] = zero;
    r[0] = zero;
    u[0] = zero;
    disp_selected_pyr[0] = zero;

    l[1] = zero;
    d[1] = zero;
    r[1] = zero;
    u[1] = zero;
    disp_selected_pyr[1] = zero;

    data_cost = zero;

    data_cost_selected = zero;

    // Double-buffer index: messages/selections ping-pong between [0] and [1]
    // as we move from coarse to fine levels.
    int cur_idx = 0;

    for (int i = levels - 1; i >= 0; i--)
    {
        if (i == levels - 1)
        {
            // Coarsest level: compute the data cost from scratch and seed
            // the initial disparity selection.
            cv::ocl::stereoCSBP::init_data_cost(left, right, temp, rthis, disp_selected_pyr[cur_idx].data,
                                                data_cost_selected.data, step_pyr[0], rows_pyr[i], cols_pyr[i],
                                                i, nr_plane_pyr[i]);
        }
        else
        {
            // Finer levels: restrict the cost to the planes selected at the
            // coarser level, then initialize this level's messages from it.
            cv::ocl::stereoCSBP::compute_data_cost(
                disp_selected_pyr[cur_idx].data, data_cost.data, rthis, step_pyr[0],
                step_pyr[0], left, right, rows_pyr[i], cols_pyr[i], rows_pyr[i + 1], i,
                nr_plane_pyr[i + 1]);

            int new_idx = (cur_idx + 1) & 1;

            cv::ocl::stereoCSBP::init_message(u[new_idx].data, d[new_idx].data, l[new_idx].data, r[new_idx].data,
                                              u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
                                              disp_selected_pyr[new_idx].data, disp_selected_pyr[cur_idx].data,
                                              data_cost_selected.data, data_cost.data, temp, rthis, step_pyr[0],
                                              step_pyr[0], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rows_pyr[i + 1],
                                              cols_pyr[i + 1], nr_plane_pyr[i + 1]);
            cur_idx = new_idx;
        }
        cv::ocl::stereoCSBP::calc_all_iterations(u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
                data_cost_selected.data, disp_selected_pyr[cur_idx].data, temp,
                rthis, step_pyr[0], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i]);
    }

    if (disp.empty())
        disp.create(rows, cols, CV_16S);

    // The kernel writes CV_16S; reuse `disp` directly when possible,
    // otherwise compute into `out` and convert at the end.
    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
    out = zero;

    stereoCSBP::compute_disp(u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
                             data_cost_selected.data, disp_selected_pyr[cur_idx].data, rthis, step_pyr[0],
                             out, nr_plane_pyr[0]);
    if (disp.type() != CV_16S)
        out.convertTo(disp, disp.type());
}
|
||||
|
||||
|
||||
// Signature shared by the per-type instantiations of csbp_operator.
typedef void (*csbp_operator_t)(StereoConstantSpaceBP &rthis, oclMat u[2], oclMat d[2], oclMat l[2], oclMat r[2],
                                oclMat disp_selected_pyr[2], oclMat &data_cost, oclMat &data_cost_selected,
                                oclMat &temp, oclMat &out, const oclMat &left, const oclMat &right, oclMat &disp);

// Dispatch table indexed by the OpenCV depth code held in msg_type: the
// short instantiation sits at the CV_16S slot and the float instantiation at
// the CV_32F slot; every other depth is unsupported (null entry, guarded by
// the CV_Assert in operator() below).
const static csbp_operator_t operators[] = {0, 0, 0, csbp_operator<short>, 0, csbp_operator<float>, 0, 0};
|
||||
|
||||
void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &left, const oclMat &right, oclMat &disp)
|
||||
{
|
||||
|
||||
CV_Assert(msg_type == CV_32F || msg_type == CV_16S);
|
||||
operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out,
|
||||
left, right, disp);
|
||||
}
|
||||
|
||||
#endif /* !defined (HAVE_OPENCL) */
|
@ -72,28 +72,21 @@ namespace stereoBM
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
static void prefilter_xsobel(const oclMat &input, oclMat &output, int prefilterCap)
|
||||
{
|
||||
Context *clCxt = input.clCxt;
|
||||
|
||||
String kernelName = "prefilter_xsobel";
|
||||
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName);
|
||||
|
||||
size_t blockSize = 1;
|
||||
size_t globalThreads[3] = { input.cols, input.rows, 1 };
|
||||
size_t localThreads[3] = { blockSize, blockSize, 1 };
|
||||
|
||||
openCLVerifyKernel(clCxt, kernel, localThreads);
|
||||
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&input.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&output.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&input.rows));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&input.cols));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&prefilterCap));
|
||||
|
||||
openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 3, NULL,
|
||||
globalThreads, localThreads, 0, NULL, NULL));
|
||||
|
||||
clFinish((cl_command_queue)clCxt->oclCommandQueue());
|
||||
openCLSafeCall(clReleaseKernel(kernel));
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.rows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.cols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&prefilterCap));
|
||||
|
||||
openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
|
||||
globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////common////////////////////////////////////
|
||||
@ -113,16 +106,13 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp,
|
||||
{
|
||||
int winsz2 = winSize >> 1;
|
||||
|
||||
Context *clCxt = left.clCxt;
|
||||
|
||||
String kernelName = "stereoKernel";
|
||||
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName);
|
||||
|
||||
disp.setTo(Scalar_<unsigned char>::all(0));
|
||||
minSSD_buf.setTo(Scalar_<unsigned int>::all(0xFFFFFFFF));
|
||||
|
||||
size_t minssd_step = minSSD_buf.step / minSSD_buf.elemSize();
|
||||
size_t local_mem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * winsz2)) *
|
||||
size_t local_mem_size = (N_DISPARITIES * (BLOCK_W + 2 * winsz2)) *
|
||||
sizeof(cl_uint);
|
||||
//size_t blockSize = 1;
|
||||
size_t localThreads[] = { BLOCK_W, 1,1};
|
||||
@ -131,26 +121,23 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp,
|
||||
1
|
||||
};
|
||||
|
||||
openCLVerifyKernel(clCxt, kernel, localThreads);
|
||||
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&left.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&right.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&minSSD_buf.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&minssd_step));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&disp.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&disp.step));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&left.rows));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&left.step));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&maxdisp));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&winsz2));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 11, local_mem_size, (void *)NULL));
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&right.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&minSSD_buf.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&minssd_step));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disp.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&disp.step));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.cols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.rows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.step));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&maxdisp));
|
||||
args.push_back(std::make_pair(local_mem_size, (void *)NULL));
|
||||
|
||||
openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 2, NULL,
|
||||
globalThreads, localThreads, 0, NULL, NULL));
|
||||
|
||||
|
||||
clFinish((cl_command_queue)clCxt->oclCommandQueue());
|
||||
openCLSafeCall(clReleaseKernel(kernel));
|
||||
char opt [128];
|
||||
sprintf(opt, "-D radius=%d", winsz2);
|
||||
openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
|
||||
globalThreads, localThreads, args, -1, -1, opt);
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////postfilter_textureness///////////////////////
|
||||
@ -158,10 +145,7 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp,
|
||||
static void postfilter_textureness(oclMat &left, int winSize,
|
||||
float avergeTexThreshold, oclMat &disparity)
|
||||
{
|
||||
Context *clCxt = left.clCxt;
|
||||
|
||||
String kernelName = "textureness_kernel";
|
||||
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName);
|
||||
|
||||
size_t blockSize = 1;
|
||||
size_t localThreads[] = { BLOCK_W, blockSize ,1};
|
||||
@ -172,22 +156,19 @@ static void postfilter_textureness(oclMat &left, int winSize,
|
||||
|
||||
size_t local_mem_size = (localThreads[0] + localThreads[0] + (winSize / 2) * 2) * sizeof(float);
|
||||
|
||||
openCLVerifyKernel(clCxt, kernel, localThreads);
|
||||
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disparity.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&disparity.rows));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&disparity.cols));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&disparity.step));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&left.data));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&left.rows));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&winSize));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&avergeTexThreshold));
|
||||
openCLSafeCall(clSetKernelArg(kernel, 9, local_mem_size, NULL));
|
||||
openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 2, NULL,
|
||||
globalThreads, localThreads, 0, NULL, NULL));
|
||||
|
||||
clFinish((cl_command_queue)clCxt->oclCommandQueue());
|
||||
openCLSafeCall(clReleaseKernel(kernel));
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disparity.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.rows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.cols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.step));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.rows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.cols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&winSize));
|
||||
args.push_back(std::make_pair(sizeof(cl_float), (void *)&avergeTexThreshold));
|
||||
args.push_back(std::make_pair(local_mem_size, (void*)NULL));
|
||||
openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
|
||||
globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////operator/////////////////////////////////
|
||||
|
475
modules/ocl/src/tvl1flow.cpp
Normal file
475
modules/ocl/src/tvl1flow.cpp
Normal file
@ -0,0 +1,475 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Jin Ma, jin@multicorewareinc.com
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
|
||||
#include "precomp.hpp"
|
||||
using namespace std;
|
||||
using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace ocl
|
||||
{
|
||||
///////////////////////////OpenCL kernel strings///////////////////////////
|
||||
extern const char* tvl1flow;
|
||||
}
|
||||
}
|
||||
|
||||
cv::ocl::OpticalFlowDual_TVL1_OCL::OpticalFlowDual_TVL1_OCL()
|
||||
{
|
||||
tau = 0.25;
|
||||
lambda = 0.15;
|
||||
theta = 0.3;
|
||||
nscales = 5;
|
||||
warps = 5;
|
||||
epsilon = 0.01;
|
||||
iterations = 300;
|
||||
useInitialFlow = false;
|
||||
}
|
||||
|
||||
void cv::ocl::OpticalFlowDual_TVL1_OCL::operator()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy)
|
||||
{
|
||||
CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
|
||||
CV_Assert( I0.size() == I1.size() );
|
||||
CV_Assert( I0.type() == I1.type() );
|
||||
CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
|
||||
CV_Assert( nscales > 0 );
|
||||
|
||||
// allocate memory for the pyramid structure
|
||||
I0s.resize(nscales);
|
||||
I1s.resize(nscales);
|
||||
u1s.resize(nscales);
|
||||
u2s.resize(nscales);
|
||||
//I0s_step == I1s_step
|
||||
I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0);
|
||||
I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);
|
||||
|
||||
|
||||
if (!useInitialFlow)
|
||||
{
|
||||
flowx.create(I0.size(), CV_32FC1);
|
||||
flowy.create(I0.size(), CV_32FC1);
|
||||
}
|
||||
//u1s_step != u2s_step
|
||||
u1s[0] = flowx;
|
||||
u2s[0] = flowy;
|
||||
|
||||
I1x_buf.create(I0.size(), CV_32FC1);
|
||||
I1y_buf.create(I0.size(), CV_32FC1);
|
||||
|
||||
I1w_buf.create(I0.size(), CV_32FC1);
|
||||
I1wx_buf.create(I0.size(), CV_32FC1);
|
||||
I1wy_buf.create(I0.size(), CV_32FC1);
|
||||
|
||||
grad_buf.create(I0.size(), CV_32FC1);
|
||||
rho_c_buf.create(I0.size(), CV_32FC1);
|
||||
|
||||
p11_buf.create(I0.size(), CV_32FC1);
|
||||
p12_buf.create(I0.size(), CV_32FC1);
|
||||
p21_buf.create(I0.size(), CV_32FC1);
|
||||
p22_buf.create(I0.size(), CV_32FC1);
|
||||
|
||||
diff_buf.create(I0.size(), CV_32FC1);
|
||||
|
||||
// create the scales
|
||||
for (int s = 1; s < nscales; ++s)
|
||||
{
|
||||
ocl::pyrDown(I0s[s - 1], I0s[s]);
|
||||
ocl::pyrDown(I1s[s - 1], I1s[s]);
|
||||
|
||||
if (I0s[s].cols < 16 || I0s[s].rows < 16)
|
||||
{
|
||||
nscales = s;
|
||||
break;
|
||||
}
|
||||
|
||||
if (useInitialFlow)
|
||||
{
|
||||
ocl::pyrDown(u1s[s - 1], u1s[s]);
|
||||
ocl::pyrDown(u2s[s - 1], u2s[s]);
|
||||
|
||||
//ocl::multiply(u1s[s], Scalar::all(0.5), u1s[s]);
|
||||
multiply(0.5, u1s[s], u1s[s]);
|
||||
//ocl::multiply(u2s[s], Scalar::all(0.5), u2s[s]);
|
||||
multiply(0.5, u1s[s], u2s[s]);
|
||||
}
|
||||
}
|
||||
|
||||
// pyramidal structure for computing the optical flow
|
||||
for (int s = nscales - 1; s >= 0; --s)
|
||||
{
|
||||
// compute the optical flow at the current scale
|
||||
procOneScale(I0s[s], I1s[s], u1s[s], u2s[s]);
|
||||
|
||||
// if this was the last scale, finish now
|
||||
if (s == 0)
|
||||
break;
|
||||
|
||||
// otherwise, upsample the optical flow
|
||||
|
||||
// zoom the optical flow for the next finer scale
|
||||
ocl::resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
|
||||
ocl::resize(u2s[s], u2s[s - 1], I0s[s - 1].size());
|
||||
|
||||
// scale the optical flow with the appropriate zoom factor
|
||||
multiply(2, u1s[s - 1], u1s[s - 1]);
|
||||
multiply(2, u2s[s - 1], u2s[s - 1]);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
namespace ocl_tvl1flow
|
||||
{
|
||||
void centeredGradient(const oclMat &src, oclMat &dx, oclMat &dy);
|
||||
|
||||
void warpBackward(const oclMat &I0, const oclMat &I1, oclMat &I1x, oclMat &I1y,
|
||||
oclMat &u1, oclMat &u2, oclMat &I1w, oclMat &I1wx, oclMat &I1wy,
|
||||
oclMat &grad, oclMat &rho);
|
||||
|
||||
void estimateU(oclMat &I1wx, oclMat &I1wy, oclMat &grad,
|
||||
oclMat &rho_c, oclMat &p11, oclMat &p12,
|
||||
oclMat &p21, oclMat &p22, oclMat &u1,
|
||||
oclMat &u2, oclMat &error, float l_t, float theta);
|
||||
|
||||
void estimateDualVariables(oclMat &u1, oclMat &u2,
|
||||
oclMat &p11, oclMat &p12, oclMat &p21, oclMat &p22, float taut);
|
||||
}
|
||||
|
||||
void cv::ocl::OpticalFlowDual_TVL1_OCL::procOneScale(const oclMat &I0, const oclMat &I1, oclMat &u1, oclMat &u2)
|
||||
{
|
||||
using namespace ocl_tvl1flow;
|
||||
|
||||
const double scaledEpsilon = epsilon * epsilon * I0.size().area();
|
||||
|
||||
CV_DbgAssert( I1.size() == I0.size() );
|
||||
CV_DbgAssert( I1.type() == I0.type() );
|
||||
CV_DbgAssert( u1.empty() || u1.size() == I0.size() );
|
||||
CV_DbgAssert( u2.size() == u1.size() );
|
||||
|
||||
if (u1.empty())
|
||||
{
|
||||
u1.create(I0.size(), CV_32FC1);
|
||||
u1.setTo(Scalar::all(0));
|
||||
|
||||
u2.create(I0.size(), CV_32FC1);
|
||||
u2.setTo(Scalar::all(0));
|
||||
}
|
||||
|
||||
oclMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
|
||||
oclMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
|
||||
|
||||
centeredGradient(I1, I1x, I1y);
|
||||
|
||||
oclMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
|
||||
oclMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
|
||||
oclMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
|
||||
|
||||
oclMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
|
||||
oclMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
|
||||
|
||||
oclMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
|
||||
oclMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
|
||||
oclMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
|
||||
oclMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
|
||||
p11.setTo(Scalar::all(0));
|
||||
p12.setTo(Scalar::all(0));
|
||||
p21.setTo(Scalar::all(0));
|
||||
p22.setTo(Scalar::all(0));
|
||||
|
||||
oclMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
|
||||
|
||||
const float l_t = static_cast<float>(lambda * theta);
|
||||
const float taut = static_cast<float>(tau / theta);
|
||||
|
||||
for (int warpings = 0; warpings < warps; ++warpings)
|
||||
{
|
||||
warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);
|
||||
|
||||
double error = numeric_limits<double>::max();
|
||||
for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
|
||||
{
|
||||
estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22,
|
||||
u1, u2, diff, l_t, static_cast<float>(theta));
|
||||
|
||||
error = ocl::sum(diff)[0];
|
||||
|
||||
estimateDualVariables(u1, u2, p11, p12, p21, p22, taut);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void cv::ocl::OpticalFlowDual_TVL1_OCL::collectGarbage()
|
||||
{
|
||||
I0s.clear();
|
||||
I1s.clear();
|
||||
u1s.clear();
|
||||
u2s.clear();
|
||||
|
||||
I1x_buf.release();
|
||||
I1y_buf.release();
|
||||
|
||||
I1w_buf.release();
|
||||
I1wx_buf.release();
|
||||
I1wy_buf.release();
|
||||
|
||||
grad_buf.release();
|
||||
rho_c_buf.release();
|
||||
|
||||
p11_buf.release();
|
||||
p12_buf.release();
|
||||
p21_buf.release();
|
||||
p22_buf.release();
|
||||
|
||||
diff_buf.release();
|
||||
norm_buf.release();
|
||||
}
|
||||
|
||||
void ocl_tvl1flow::centeredGradient(const oclMat &src, oclMat &dx, oclMat &dy)
|
||||
{
|
||||
Context *clCxt = src.clCxt;
|
||||
size_t localThreads[3] = {32, 8, 1};
|
||||
size_t globalThreads[3] = {src.cols, src.rows, 1};
|
||||
|
||||
int srcElementSize = src.elemSize();
|
||||
int src_step = src.step/srcElementSize;
|
||||
|
||||
int dElememntSize = dx.elemSize();
|
||||
int dx_step = dx.step/dElememntSize;
|
||||
|
||||
String kernelName = "centeredGradientKernel";
|
||||
vector< pair<size_t, const void *> > args;
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&src.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&src.cols));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&src.rows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&src_step));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&dx.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&dy.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&dx_step));
|
||||
openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
|
||||
}
|
||||
|
||||
void ocl_tvl1flow::estimateDualVariables(oclMat &u1, oclMat &u2, oclMat &p11, oclMat &p12, oclMat &p21, oclMat &p22, float taut)
|
||||
{
|
||||
Context *clCxt = u1.clCxt;
|
||||
|
||||
size_t localThread[] = {32, 8, 1};
|
||||
size_t globalThread[] =
|
||||
{
|
||||
u1.cols,
|
||||
u1.rows,
|
||||
1
|
||||
};
|
||||
|
||||
int u1_element_size = u1.elemSize();
|
||||
int u1_step = u1.step/u1_element_size;
|
||||
|
||||
int u2_element_size = u2.elemSize();
|
||||
int u2_step = u2.step/u2_element_size;
|
||||
|
||||
int p11_element_size = p11.elemSize();
|
||||
int p11_step = p11.step/p11_element_size;
|
||||
|
||||
int u1_offset_y = u1.offset/u1.step;
|
||||
int u1_offset_x = u1.offset%u1.step;
|
||||
u1_offset_x = u1_offset_x/u1.elemSize();
|
||||
|
||||
int u2_offset_y = u2.offset/u2.step;
|
||||
int u2_offset_x = u2.offset%u2.step;
|
||||
u2_offset_x = u2_offset_x/u2.elemSize();
|
||||
|
||||
String kernelName = "estimateDualVariablesKernel";
|
||||
vector< pair<size_t, const void *> > args;
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u1.cols));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u1.rows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u1_step));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&p11.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&p11_step));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&p12.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&p21.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&p22.data));
|
||||
args.push_back( make_pair( sizeof(cl_float), (void*)&taut));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u2_step));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y));
|
||||
|
||||
openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
|
||||
}
|
||||
|
||||
void ocl_tvl1flow::estimateU(oclMat &I1wx, oclMat &I1wy, oclMat &grad,
|
||||
oclMat &rho_c, oclMat &p11, oclMat &p12,
|
||||
oclMat &p21, oclMat &p22, oclMat &u1,
|
||||
oclMat &u2, oclMat &error, float l_t, float theta)
|
||||
{
|
||||
Context* clCxt = I1wx.clCxt;
|
||||
|
||||
size_t localThread[] = {32, 8, 1};
|
||||
size_t globalThread[] =
|
||||
{
|
||||
I1wx.cols,
|
||||
I1wx.rows,
|
||||
1
|
||||
};
|
||||
|
||||
int I1wx_element_size = I1wx.elemSize();
|
||||
int I1wx_step = I1wx.step/I1wx_element_size;
|
||||
|
||||
int u1_element_size = u1.elemSize();
|
||||
int u1_step = u1.step/u1_element_size;
|
||||
|
||||
int u2_element_size = u2.elemSize();
|
||||
int u2_step = u2.step/u2_element_size;
|
||||
|
||||
int u1_offset_y = u1.offset/u1.step;
|
||||
int u1_offset_x = u1.offset%u1.step;
|
||||
u1_offset_x = u1_offset_x/u1.elemSize();
|
||||
|
||||
int u2_offset_y = u2.offset/u2.step;
|
||||
int u2_offset_x = u2.offset%u2.step;
|
||||
u2_offset_x = u2_offset_x/u2.elemSize();
|
||||
|
||||
String kernelName = "estimateUKernel";
|
||||
vector< pair<size_t, const void *> > args;
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wx.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx.cols));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx.rows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx_step));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wy.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&grad.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&rho_c.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&p11.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&p12.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&p21.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&p22.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u1_step));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&error.data));
|
||||
args.push_back( make_pair( sizeof(cl_float), (void*)&l_t));
|
||||
args.push_back( make_pair( sizeof(cl_float), (void*)&theta));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u2_step));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y));
|
||||
|
||||
openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
|
||||
}
|
||||
|
||||
void ocl_tvl1flow::warpBackward(const oclMat &I0, const oclMat &I1, oclMat &I1x, oclMat &I1y, oclMat &u1, oclMat &u2, oclMat &I1w, oclMat &I1wx, oclMat &I1wy, oclMat &grad, oclMat &rho)
|
||||
{
|
||||
Context* clCxt = I0.clCxt;
|
||||
const bool isImgSupported = support_image2d(clCxt);
|
||||
|
||||
CV_Assert(isImgSupported);
|
||||
|
||||
int u1ElementSize = u1.elemSize();
|
||||
int u1Step = u1.step/u1ElementSize;
|
||||
|
||||
int u2ElementSize = u2.elemSize();
|
||||
int u2Step = u2.step/u2ElementSize;
|
||||
|
||||
int I0ElementSize = I0.elemSize();
|
||||
int I0Step = I0.step/I0ElementSize;
|
||||
|
||||
int I1w_element_size = I1w.elemSize();
|
||||
int I1w_step = I1w.step/I1w_element_size;
|
||||
|
||||
int u1_offset_y = u1.offset/u1.step;
|
||||
int u1_offset_x = u1.offset%u1.step;
|
||||
u1_offset_x = u1_offset_x/u1.elemSize();
|
||||
|
||||
int u2_offset_y = u2.offset/u2.step;
|
||||
int u2_offset_x = u2.offset%u2.step;
|
||||
u2_offset_x = u2_offset_x/u2.elemSize();
|
||||
|
||||
size_t localThread[] = {32, 8, 1};
|
||||
size_t globalThread[] =
|
||||
{
|
||||
I0.cols,
|
||||
I0.rows,
|
||||
1
|
||||
};
|
||||
|
||||
cl_mem I1_tex;
|
||||
cl_mem I1x_tex;
|
||||
cl_mem I1y_tex;
|
||||
I1_tex = bindTexture(I1);
|
||||
I1x_tex = bindTexture(I1x);
|
||||
I1y_tex = bindTexture(I1y);
|
||||
|
||||
String kernelName = "warpBackwardKernel";
|
||||
vector< pair<size_t, const void *> > args;
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I0.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&I0Step));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&I0.cols));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&I0.rows));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1_tex));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1x_tex));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1y_tex));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u1Step));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1w.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wx.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wy.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&grad.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&rho.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&I1w_step));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u2Step));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y));
|
||||
|
||||
openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
|
||||
}
|
@ -1,120 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// Intel License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000, Intel Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_TEST_INTERPOLATION_HPP__
|
||||
#define __OPENCV_TEST_INTERPOLATION_HPP__
|
||||
|
||||
template <typename T> T readVal(const cv::Mat &src, int y, int x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
|
||||
{
|
||||
if (border_type == cv::BORDER_CONSTANT)
|
||||
return (y >= 0 && y < src.rows && x >= 0 && x < src.cols) ? src.at<T>(y, x * src.channels() + c) : cv::saturate_cast<T>(borderVal.val[c]);
|
||||
|
||||
return src.at<T>(cv::borderInterpolate(y, src.rows, border_type), cv::borderInterpolate(x, src.cols, border_type) * src.channels() + c);
|
||||
}
|
||||
|
||||
template <typename T> struct NearestInterpolator
|
||||
{
|
||||
static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
|
||||
{
|
||||
return readVal<T>(src, cvFloor(y), cvFloor(x), c, border_type, borderVal);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> struct LinearInterpolator
|
||||
{
|
||||
static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
|
||||
{
|
||||
x -= 0.5f;
|
||||
y -= 0.5f;
|
||||
|
||||
int x1 = cvFloor(x);
|
||||
int y1 = cvFloor(y);
|
||||
int x2 = x1 + 1;
|
||||
int y2 = y1 + 1;
|
||||
|
||||
float res = 0;
|
||||
|
||||
res += readVal<T>(src, y1, x1, c, border_type, borderVal) * ((x2 - x) * (y2 - y));
|
||||
res += readVal<T>(src, y1, x2, c, border_type, borderVal) * ((x - x1) * (y2 - y));
|
||||
res += readVal<T>(src, y2, x1, c, border_type, borderVal) * ((x2 - x) * (y - y1));
|
||||
res += readVal<T>(src, y2, x2, c, border_type, borderVal) * ((x - x1) * (y - y1));
|
||||
|
||||
return cv::saturate_cast<T>(res);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> struct CubicInterpolator
|
||||
{
|
||||
static float getValue(float p[4], float x)
|
||||
{
|
||||
return p[1] + 0.5 * x * (p[2] - p[0] + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
|
||||
}
|
||||
|
||||
static float getValue(float p[4][4], float x, float y)
|
||||
{
|
||||
float arr[4];
|
||||
|
||||
arr[0] = getValue(p[0], x);
|
||||
arr[1] = getValue(p[1], x);
|
||||
arr[2] = getValue(p[2], x);
|
||||
arr[3] = getValue(p[3], x);
|
||||
|
||||
return getValue(arr, y);
|
||||
}
|
||||
|
||||
static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
|
||||
{
|
||||
int ix = cvRound(x);
|
||||
int iy = cvRound(y);
|
||||
|
||||
float vals[4][4] =
|
||||
{
|
||||
{readVal<T>(src, iy - 2, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 2, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 2, ix, c, border_type, borderVal), readVal<T>(src, iy - 2, ix + 1, c, border_type, borderVal)},
|
||||
{readVal<T>(src, iy - 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 1, ix, c, border_type, borderVal), readVal<T>(src, iy - 1, ix + 1, c, border_type, borderVal)},
|
||||
{readVal<T>(src, iy , ix - 2, c, border_type, borderVal), readVal<T>(src, iy , ix - 1, c, border_type, borderVal), readVal<T>(src, iy , ix, c, border_type, borderVal), readVal<T>(src, iy , ix + 1, c, border_type, borderVal)},
|
||||
{readVal<T>(src, iy + 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy + 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy + 1, ix, c, border_type, borderVal), readVal<T>(src, iy + 1, ix + 1, c, border_type, borderVal)},
|
||||
};
|
||||
|
||||
return cv::saturate_cast<T>(getValue(vals, (x - ix + 2.0) / 4.0, (y - iy + 2.0) / 4.0));
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __OPENCV_TEST_INTERPOLATION_HPP__
|
@ -68,9 +68,7 @@
|
||||
#include "opencv2/ocl.hpp"
|
||||
|
||||
#include "utility.hpp"
|
||||
#include "interpolation.hpp"
|
||||
|
||||
#include "opencv2/core/private.hpp"
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
@ -21,6 +22,7 @@
|
||||
// Jiang Liyuan,jlyuan001.good@163.com
|
||||
// Rock Li, Rock.Li@amd.com
|
||||
// Zailong Wu, bullet@yeah.net
|
||||
// Yao Wang, bitwangyaoyao@gmail.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -87,14 +89,13 @@ PARAM_TEST_CASE(ArithmTestBase, MatType, bool)
|
||||
int maskx;
|
||||
int masky;
|
||||
|
||||
|
||||
//src mat with roi
|
||||
cv::Mat mat1_roi;
|
||||
cv::Mat mat2_roi;
|
||||
cv::Mat mask_roi;
|
||||
cv::Mat dst_roi;
|
||||
cv::Mat dst1_roi; //bak
|
||||
//std::vector<cv::ocl::Info> oclinfo;
|
||||
|
||||
//ocl dst mat for testing
|
||||
cv::ocl::oclMat gdst_whole;
|
||||
cv::ocl::oclMat gdst1_whole; //bak
|
||||
@ -125,10 +126,6 @@ PARAM_TEST_CASE(ArithmTestBase, MatType, bool)
|
||||
|
||||
val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
|
||||
|
||||
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
|
||||
//CV_Assert(devnums > 0);
|
||||
////if you want to use undefault device, set it here
|
||||
////setDevice(oclinfo[0]);
|
||||
}
|
||||
|
||||
void random_roi()
|
||||
@ -175,14 +172,22 @@ PARAM_TEST_CASE(ArithmTestBase, MatType, bool)
|
||||
gmask = mask_roi; //end
|
||||
}
|
||||
|
||||
void Near(double threshold = 0.)
|
||||
{
|
||||
EXPECT_MAT_NEAR(dst, Mat(gdst_whole), threshold);
|
||||
}
|
||||
|
||||
void Near1(double threshold = 0.)
|
||||
{
|
||||
EXPECT_MAT_NEAR(dst1, Mat(gdst1_whole), threshold);
|
||||
}
|
||||
|
||||
};
|
||||
////////////////////////////////lut/////////////////////////////////////////////////
|
||||
|
||||
struct Lut : ArithmTestBase {};
|
||||
#define VARNAME(A) string(#A);
|
||||
|
||||
|
||||
|
||||
TEST_P(Lut, Mat)
|
||||
{
|
||||
|
||||
@ -203,20 +208,12 @@ TEST_P(Lut, Mat)
|
||||
|
||||
cv::LUT(mat1_roi, mat2_roi, dst_roi);
|
||||
cv::ocl::LUT(gmat1, gmat2, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download (cpu_dst);
|
||||
char s[1024];
|
||||
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0, s);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
////////////////////////////////exp/////////////////////////////////////////////////
|
||||
|
||||
struct Exp : ArithmTestBase {};
|
||||
|
||||
TEST_P(Exp, Mat)
|
||||
@ -227,20 +224,12 @@ TEST_P(Exp, Mat)
|
||||
|
||||
cv::exp(mat1_roi, dst_roi);
|
||||
cv::ocl::exp(gmat1, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
|
||||
char s[1024];
|
||||
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 2, s);
|
||||
|
||||
Near(2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////log/////////////////////////////////////////////////
|
||||
|
||||
struct Log : ArithmTestBase {};
|
||||
|
||||
TEST_P(Log, Mat)
|
||||
@ -249,24 +238,14 @@ TEST_P(Log, Mat)
|
||||
{
|
||||
random_roi();
|
||||
|
||||
|
||||
cv::log(mat1_roi, dst_roi);
|
||||
cv::ocl::log(gmat1, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char s[1024];
|
||||
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1, s);
|
||||
|
||||
Near(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
////////////////////////////////add/////////////////////////////////////////////////
|
||||
|
||||
struct Add : ArithmTestBase {};
|
||||
|
||||
TEST_P(Add, Mat)
|
||||
@ -277,12 +256,7 @@ TEST_P(Add, Mat)
|
||||
|
||||
cv::add(mat1_roi, mat2_roi, dst_roi);
|
||||
cv::ocl::add(gmat1, gmat2, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char s[1024];
|
||||
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -294,14 +268,10 @@ TEST_P(Add, Mat_Mask)
|
||||
|
||||
cv::add(mat1_roi, mat2_roi, dst_roi, mask_roi);
|
||||
cv::ocl::add(gmat1, gmat2, gdst, gmask);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char s[1024];
|
||||
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_P(Add, Scalar)
|
||||
{
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
@ -310,12 +280,7 @@ TEST_P(Add, Scalar)
|
||||
|
||||
cv::add(mat1_roi, val, dst_roi);
|
||||
cv::ocl::add(gmat1, val, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char s[1024];
|
||||
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, s);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -327,12 +292,7 @@ TEST_P(Add, Scalar_Mask)
|
||||
|
||||
cv::add(mat1_roi, val, dst_roi, mask_roi);
|
||||
cv::ocl::add(gmat1, val, gdst, gmask);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char s[1024];
|
||||
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, s);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -349,12 +309,7 @@ TEST_P(Sub, Mat)
|
||||
|
||||
cv::subtract(mat1_roi, mat2_roi, dst_roi);
|
||||
cv::ocl::subtract(gmat1, gmat2, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char s[1024];
|
||||
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -366,14 +321,10 @@ TEST_P(Sub, Mat_Mask)
|
||||
|
||||
cv::subtract(mat1_roi, mat2_roi, dst_roi, mask_roi);
|
||||
cv::ocl::subtract(gmat1, gmat2, gdst, gmask);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char s[1024];
|
||||
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_P(Sub, Scalar)
|
||||
{
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
@ -382,12 +333,7 @@ TEST_P(Sub, Scalar)
|
||||
|
||||
cv::subtract(mat1_roi, val, dst_roi);
|
||||
cv::ocl::subtract(gmat1, val, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char s[1024];
|
||||
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, s);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -399,12 +345,7 @@ TEST_P(Sub, Scalar_Mask)
|
||||
|
||||
cv::subtract(mat1_roi, val, dst_roi, mask_roi);
|
||||
cv::ocl::subtract(gmat1, val, gdst, gmask);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char s[1024];
|
||||
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, s);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -421,12 +362,7 @@ TEST_P(Mul, Mat)
|
||||
|
||||
cv::multiply(mat1_roi, mat2_roi, dst_roi);
|
||||
cv::ocl::multiply(gmat1, gmat2, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char s[1024];
|
||||
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -441,12 +377,7 @@ TEST_P(Mul, Mat_Scalar)
|
||||
|
||||
cv::multiply(mat1_roi, mat2_roi, dst_roi, s);
|
||||
cv::ocl::multiply(gmat1, gmat2, gdst, s);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.001, sss);
|
||||
Near(.001);
|
||||
}
|
||||
}
|
||||
|
||||
@ -462,13 +393,7 @@ TEST_P(Div, Mat)
|
||||
|
||||
cv::divide(mat1_roi, mat2_roi, dst_roi);
|
||||
cv::ocl::divide(gmat1, gmat2, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss);
|
||||
Near(1);
|
||||
}
|
||||
}
|
||||
|
||||
@ -483,13 +408,7 @@ TEST_P(Div, Mat_Scalar)
|
||||
|
||||
cv::divide(mat1_roi, mat2_roi, dst_roi, s);
|
||||
cv::ocl::divide(gmat1, gmat2, gdst, s);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.001, sss);
|
||||
Near(.001);
|
||||
}
|
||||
}
|
||||
|
||||
@ -504,13 +423,7 @@ TEST_P(Absdiff, Mat)
|
||||
|
||||
cv::absdiff(mat1_roi, mat2_roi, dst_roi);
|
||||
cv::ocl::absdiff(gmat1, gmat2, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0, sss);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -522,13 +435,7 @@ TEST_P(Absdiff, Mat_Scalar)
|
||||
|
||||
cv::absdiff(mat1_roi, val, dst_roi);
|
||||
cv::ocl::absdiff(gmat1, val, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -544,16 +451,8 @@ TEST_P(CartToPolar, angleInDegree)
|
||||
|
||||
cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1);
|
||||
cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 1);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
|
||||
cv::Mat cpu_dst1;
|
||||
gdst1_whole.download(cpu_dst1);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.5, sss);
|
||||
EXPECT_MAT_NEAR(dst1, cpu_dst1, 0.5, sss);
|
||||
Near(.5);
|
||||
Near1(.5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -565,22 +464,12 @@ TEST_P(CartToPolar, angleInRadians)
|
||||
|
||||
cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0);
|
||||
cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 0);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
|
||||
cv::Mat cpu_dst1;
|
||||
gdst1_whole.download(cpu_dst1);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.5, sss);
|
||||
EXPECT_MAT_NEAR(dst1, cpu_dst1, 0.5, sss);
|
||||
Near(.5);
|
||||
Near1(.5);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
struct PolarToCart : ArithmTestBase {};
|
||||
|
||||
TEST_P(PolarToCart, angleInDegree)
|
||||
@ -591,17 +480,8 @@ TEST_P(PolarToCart, angleInDegree)
|
||||
|
||||
cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1);
|
||||
cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 1);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
|
||||
cv::Mat cpu_dst1;
|
||||
gdst1_whole.download(cpu_dst1);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.5, sss);
|
||||
EXPECT_MAT_NEAR(dst1, cpu_dst1, 0.5, sss);
|
||||
Near(.5);
|
||||
Near1(.5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -613,17 +493,8 @@ TEST_P(PolarToCart, angleInRadians)
|
||||
|
||||
cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0);
|
||||
cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 0);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
|
||||
cv::Mat cpu_dst1;
|
||||
gdst1_whole.download(cpu_dst1);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.5, sss);
|
||||
EXPECT_MAT_NEAR(dst1, cpu_dst1, 0.5, sss);
|
||||
Near(.5);
|
||||
Near1(.5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -640,19 +511,11 @@ TEST_P(Magnitude, Mat)
|
||||
|
||||
cv::magnitude(mat1_roi, mat2_roi, dst_roi);
|
||||
cv::ocl::magnitude(gmat1, gmat2, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
struct Transpose : ArithmTestBase {};
|
||||
|
||||
TEST_P(Transpose, Mat)
|
||||
@ -663,20 +526,11 @@ TEST_P(Transpose, Mat)
|
||||
|
||||
cv::transpose(mat1_roi, dst_roi);
|
||||
cv::ocl::transpose(gmat1, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
struct Flip : ArithmTestBase {};
|
||||
|
||||
TEST_P(Flip, X)
|
||||
@ -687,13 +541,7 @@ TEST_P(Flip, X)
|
||||
|
||||
cv::flip(mat1_roi, dst_roi, 0);
|
||||
cv::ocl::flip(gmat1, gdst, 0);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -705,13 +553,7 @@ TEST_P(Flip, Y)
|
||||
|
||||
cv::flip(mat1_roi, dst_roi, 1);
|
||||
cv::ocl::flip(gmat1, gdst, 1);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -723,18 +565,11 @@ TEST_P(Flip, BOTH)
|
||||
|
||||
cv::flip(mat1_roi, dst_roi, -1);
|
||||
cv::ocl::flip(gmat1, gdst, -1);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
struct MinMax : ArithmTestBase {};
|
||||
|
||||
TEST_P(MinMax, MAT)
|
||||
@ -765,12 +600,8 @@ TEST_P(MinMax, MAT)
|
||||
double minVal_, maxVal_;
|
||||
cv::ocl::minMax(gmat1, &minVal_, &maxVal_);
|
||||
|
||||
//check results
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_DOUBLE_EQ(minVal_, minVal) << sss;
|
||||
EXPECT_DOUBLE_EQ(maxVal_, maxVal) << sss;
|
||||
EXPECT_DOUBLE_EQ(minVal_, minVal);
|
||||
EXPECT_DOUBLE_EQ(maxVal_, maxVal);
|
||||
}
|
||||
}
|
||||
|
||||
@ -803,12 +634,8 @@ TEST_P(MinMax, MASK)
|
||||
double minVal_, maxVal_;
|
||||
cv::ocl::minMax(gmat1, &minVal_, &maxVal_, gmask);
|
||||
|
||||
//check results
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_DOUBLE_EQ(minVal_, minVal) << sss;
|
||||
EXPECT_DOUBLE_EQ(maxVal_, maxVal) << sss;
|
||||
EXPECT_DOUBLE_EQ(minVal_, minVal);
|
||||
EXPECT_DOUBLE_EQ(maxVal_, maxVal);
|
||||
}
|
||||
}
|
||||
|
||||
@ -919,17 +746,13 @@ TEST_P(MinMaxLoc, MAT)
|
||||
error1 = ::abs(mat1_roi.at<double>(maxLoc_) - mat1_roi.at<double>(maxLoc));
|
||||
}
|
||||
|
||||
//check results
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_DOUBLE_EQ(minVal_, minVal);
|
||||
EXPECT_DOUBLE_EQ(maxVal_, maxVal);
|
||||
EXPECT_DOUBLE_EQ(minlocVal_, minlocVal);
|
||||
EXPECT_DOUBLE_EQ(maxlocVal_, maxlocVal);
|
||||
|
||||
EXPECT_DOUBLE_EQ(minVal_, minVal) << sss;
|
||||
EXPECT_DOUBLE_EQ(maxVal_, maxVal) << sss;
|
||||
EXPECT_DOUBLE_EQ(minlocVal_, minlocVal) << sss;
|
||||
EXPECT_DOUBLE_EQ(maxlocVal_, maxlocVal) << sss;
|
||||
|
||||
EXPECT_DOUBLE_EQ(error0, 0.0) << sss;
|
||||
EXPECT_DOUBLE_EQ(error1, 0.0) << sss;
|
||||
EXPECT_DOUBLE_EQ(error0, 0.0);
|
||||
EXPECT_DOUBLE_EQ(error1, 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1040,17 +863,13 @@ TEST_P(MinMaxLoc, MASK)
|
||||
error1 = ::abs(mat1_roi.at<double>(maxLoc_) - mat1_roi.at<double>(maxLoc));
|
||||
}
|
||||
|
||||
//check results
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_DOUBLE_EQ(minVal_, minVal);
|
||||
EXPECT_DOUBLE_EQ(maxVal_, maxVal);
|
||||
EXPECT_DOUBLE_EQ(minlocVal_, minlocVal);
|
||||
EXPECT_DOUBLE_EQ(maxlocVal_, maxlocVal);
|
||||
|
||||
EXPECT_DOUBLE_EQ(minVal_, minVal) << sss;
|
||||
EXPECT_DOUBLE_EQ(maxVal_, maxVal) << sss;
|
||||
EXPECT_DOUBLE_EQ(minlocVal_, minlocVal) << sss;
|
||||
EXPECT_DOUBLE_EQ(maxlocVal_, maxlocVal) << sss;
|
||||
|
||||
EXPECT_DOUBLE_EQ(error0, 0.0) << sss;
|
||||
EXPECT_DOUBLE_EQ(error1, 0.0) << sss;
|
||||
EXPECT_DOUBLE_EQ(error0, 0.0);
|
||||
EXPECT_DOUBLE_EQ(error1, 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1064,14 +883,12 @@ TEST_P(Sum, MAT)
|
||||
random_roi();
|
||||
Scalar cpures = cv::sum(mat1_roi);
|
||||
Scalar gpures = cv::ocl::sum(gmat1);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
//check results
|
||||
EXPECT_NEAR(cpures[0], gpures[0], 0.1) << sss;
|
||||
EXPECT_NEAR(cpures[1], gpures[1], 0.1) << sss;
|
||||
EXPECT_NEAR(cpures[2], gpures[2], 0.1) << sss;
|
||||
EXPECT_NEAR(cpures[3], gpures[3], 0.1) << sss;
|
||||
EXPECT_NEAR(cpures[0], gpures[0], 0.1);
|
||||
EXPECT_NEAR(cpures[1], gpures[1], 0.1);
|
||||
EXPECT_NEAR(cpures[2], gpures[2], 0.1);
|
||||
EXPECT_NEAR(cpures[3], gpures[3], 0.1);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1086,11 +903,7 @@ TEST_P(CountNonZero, MAT)
|
||||
int cpures = cv::countNonZero(mat1_roi);
|
||||
int gpures = cv::ocl::countNonZero(gmat1);
|
||||
|
||||
//check results
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_DOUBLE_EQ((double)cpures, (double)gpures) << sss;
|
||||
EXPECT_DOUBLE_EQ((double)cpures, (double)gpures);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1112,13 +925,7 @@ TEST_P(Phase, Mat)
|
||||
random_roi();
|
||||
cv::phase(mat1_roi, mat2_roi, dst_roi, angelInDegrees ? true : false);
|
||||
cv::ocl::phase(gmat1, gmat2, gdst, angelInDegrees ? true : false);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-2, sss);
|
||||
Near(1e-2);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1135,13 +942,7 @@ TEST_P(Bitwise_and, Mat)
|
||||
|
||||
cv::bitwise_and(mat1_roi, mat2_roi, dst_roi);
|
||||
cv::ocl::bitwise_and(gmat1, gmat2, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1153,15 +954,10 @@ TEST_P(Bitwise_and, Mat_Mask)
|
||||
|
||||
cv::bitwise_and(mat1_roi, mat2_roi, dst_roi, mask_roi);
|
||||
cv::ocl::bitwise_and(gmat1, gmat2, gdst, gmask);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_P(Bitwise_and, Scalar)
|
||||
{
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
@ -1170,14 +966,7 @@ TEST_P(Bitwise_and, Scalar)
|
||||
|
||||
cv::bitwise_and(mat1_roi, val, dst_roi);
|
||||
cv::ocl::bitwise_and(gmat1, val, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1189,14 +978,7 @@ TEST_P(Bitwise_and, Scalar_Mask)
|
||||
|
||||
cv::bitwise_and(mat1_roi, val, dst_roi, mask_roi);
|
||||
cv::ocl::bitwise_and(gmat1, val, gdst, gmask);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char *sss = new char[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
delete[] sss;
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1214,13 +996,7 @@ TEST_P(Bitwise_or, Mat)
|
||||
|
||||
cv::bitwise_or(mat1_roi, mat2_roi, dst_roi);
|
||||
cv::ocl::bitwise_or(gmat1, gmat2, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1232,15 +1008,10 @@ TEST_P(Bitwise_or, Mat_Mask)
|
||||
|
||||
cv::bitwise_or(mat1_roi, mat2_roi, dst_roi, mask_roi);
|
||||
cv::ocl::bitwise_or(gmat1, gmat2, gdst, gmask);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_P(Bitwise_or, Scalar)
|
||||
{
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
@ -1249,13 +1020,7 @@ TEST_P(Bitwise_or, Scalar)
|
||||
|
||||
cv::bitwise_or(mat1_roi, val, dst_roi);
|
||||
cv::ocl::bitwise_or(gmat1, val, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1267,13 +1032,7 @@ TEST_P(Bitwise_or, Scalar_Mask)
|
||||
|
||||
cv::bitwise_or(mat1_roi, val, dst_roi, mask_roi);
|
||||
cv::ocl::bitwise_or(gmat1, val, gdst, gmask);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1291,13 +1050,7 @@ TEST_P(Bitwise_xor, Mat)
|
||||
|
||||
cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi);
|
||||
cv::ocl::bitwise_xor(gmat1, gmat2, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1309,15 +1062,10 @@ TEST_P(Bitwise_xor, Mat_Mask)
|
||||
|
||||
cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi, mask_roi);
|
||||
cv::ocl::bitwise_xor(gmat1, gmat2, gdst, gmask);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_P(Bitwise_xor, Scalar)
|
||||
{
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
@ -1326,13 +1074,7 @@ TEST_P(Bitwise_xor, Scalar)
|
||||
|
||||
cv::bitwise_xor(mat1_roi, val, dst_roi);
|
||||
cv::ocl::bitwise_xor(gmat1, val, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1344,13 +1086,7 @@ TEST_P(Bitwise_xor, Scalar_Mask)
|
||||
|
||||
cv::bitwise_xor(mat1_roi, val, dst_roi, mask_roi);
|
||||
cv::ocl::bitwise_xor(gmat1, val, gdst, gmask);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1367,13 +1103,7 @@ TEST_P(Bitwise_not, Mat)
|
||||
|
||||
cv::bitwise_not(mat1_roi, dst_roi);
|
||||
cv::ocl::bitwise_not(gmat1, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1390,7 +1120,7 @@ TEST_P(Compare, Mat)
|
||||
}
|
||||
|
||||
int cmp_codes[] = {CMP_EQ, CMP_GT, CMP_GE, CMP_LT, CMP_LE, CMP_NE};
|
||||
const char *cmp_str[] = {"CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE"};
|
||||
//const char *cmp_str[] = {"CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE"};
|
||||
int cmp_num = sizeof(cmp_codes) / sizeof(int);
|
||||
|
||||
for (int i = 0; i < cmp_num; ++i)
|
||||
@ -1402,13 +1132,7 @@ TEST_P(Compare, Mat)
|
||||
|
||||
cv::compare(mat1_roi, mat2_roi, dst_roi, cmp_codes[i]);
|
||||
cv::ocl::compare(gmat1, gmat2, gdst, cmp_codes[i]);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "cmptype=%s, roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", cmp_str[i], roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
|
||||
Near(0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1430,14 +1154,7 @@ TEST_P(Pow, Mat)
|
||||
double p = 4.5;
|
||||
cv::pow(mat1_roi, p, dst_roi);
|
||||
cv::ocl::pow(gmat1, p, gdst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss);
|
||||
Near(1);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1448,36 +1165,18 @@ TEST_P(MagnitudeSqr, Mat)
|
||||
{
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
// random_roi();
|
||||
// int64 start, end;
|
||||
// start = cv::getTickCount();
|
||||
random_roi();
|
||||
for(int i = 0; i < mat1.rows; ++i)
|
||||
for(int j = 0; j < mat1.cols; ++j)
|
||||
{
|
||||
float val1 = mat1.at<float>(i, j);
|
||||
float val2 = mat2.at<float>(i, j);
|
||||
|
||||
((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
|
||||
|
||||
// float val1 =((float *)( mat1.data))[(i*mat1.step/8 +j)*2];
|
||||
//
|
||||
// float val2 =((float *)( mat1.data))[(i*mat1.step/8 +j)*2+ 1 ];
|
||||
|
||||
// ((float *)(dst.data))[i*dst.step/4 +j]= val1 * val1 +val2 * val2;
|
||||
}
|
||||
// end = cv::getTickCount();
|
||||
|
||||
|
||||
|
||||
cv::ocl::oclMat clmat1(mat1), clmat2(mat2), cldst;
|
||||
cv::ocl::magnitudeSqr(clmat1, clmat2, cldst);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
cldst.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss);
|
||||
cv::ocl::oclMat clmat1(mat1), clmat2(mat2);
|
||||
cv::ocl::magnitudeSqr(clmat1, clmat2, gdst);
|
||||
Near(1);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1498,21 +1197,13 @@ TEST_P(AddWeighted, Mat)
|
||||
|
||||
cv::ocl::addWeighted(gmat1, alpha, gmat2, beta, gama, gdst);
|
||||
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//********test****************
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(
|
||||
|
@ -1,3 +1,47 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Nathan, liujun@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
#include "precomp.hpp"
|
||||
#include <iomanip>
|
||||
|
||||
@ -33,20 +77,14 @@ void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &we
|
||||
|
||||
PARAM_TEST_CASE(Blend, cv::Size, MatType/*, UseRoi*/)
|
||||
{
|
||||
//std::vector<cv::ocl::Info> oclinfo;
|
||||
cv::Size size;
|
||||
int type;
|
||||
bool useRoi;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
//devInfo = GET_PARAM(0);
|
||||
size = GET_PARAM(0);
|
||||
type = GET_PARAM(1);
|
||||
/*useRoi = GET_PARAM(3);*/
|
||||
|
||||
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
|
||||
//CV_Assert(devnums > 0);
|
||||
}
|
||||
};
|
||||
|
||||
@ -59,12 +97,9 @@ TEST_P(Blend, Accuracy)
|
||||
cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
|
||||
cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);
|
||||
|
||||
cv::ocl::oclMat gimg1(size, type), gimg2(size, type), gweights1(size, CV_32F), gweights2(size, CV_32F);
|
||||
cv::ocl::oclMat dst(size, type);
|
||||
gimg1.upload(img1);
|
||||
gimg2.upload(img2);
|
||||
gweights1.upload(weights1);
|
||||
gweights2.upload(weights2);
|
||||
cv::ocl::oclMat gimg1(img1), gimg2(img2), gweights1(weights1), gweights2(weights2);
|
||||
cv::ocl::oclMat dst;
|
||||
|
||||
cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, dst);
|
||||
cv::Mat result;
|
||||
cv::Mat result_gold;
|
||||
@ -74,10 +109,10 @@ TEST_P(Blend, Accuracy)
|
||||
else
|
||||
blendLinearGold<float>(img1, img2, weights1, weights2, result_gold);
|
||||
|
||||
EXPECT_MAT_NEAR(result_gold, result, CV_MAT_DEPTH(type) == CV_8U ? 1.f : 1e-5f, 0);
|
||||
EXPECT_MAT_NEAR(result_gold, result, CV_MAT_DEPTH(type) == CV_8U ? 1.f : 1e-5f);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, Combine(
|
||||
INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Blend, Combine(
|
||||
DIFFERENT_SIZES,
|
||||
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4))
|
||||
));
|
||||
|
@ -7,12 +7,16 @@
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// Intel License Agreement
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Nathan, liujun@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
@ -21,12 +25,12 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote products
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
|
@ -129,9 +129,69 @@ TEST_P(StereoMatchBP, Regression)
|
||||
bp(d_left, d_right, d_disp);
|
||||
d_disp.download(disp);
|
||||
disp.convertTo(disp, disp_gold.depth());
|
||||
EXPECT_MAT_NEAR(disp_gold, disp, 0.0, "");
|
||||
EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
|
||||
}
|
||||
INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchBP, testing::Combine(testing::Values(64),
|
||||
testing::Values(8),testing::Values(2),testing::Values(25.0f),
|
||||
testing::Values(0.1f),testing::Values(15.0f),testing::Values(1.0f)));
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// ConstSpaceBeliefPropagation
|
||||
PARAM_TEST_CASE(StereoMatchConstSpaceBP, int, int, int, int, float, float, float, float, int, int)
|
||||
{
|
||||
int ndisp_;
|
||||
int iters_;
|
||||
int levels_;
|
||||
int nr_plane_;
|
||||
float max_data_term_;
|
||||
float data_weight_;
|
||||
float max_disc_term_;
|
||||
float disc_single_jump_;
|
||||
int min_disp_th_;
|
||||
int msg_type_;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
ndisp_ = GET_PARAM(0);
|
||||
iters_ = GET_PARAM(1);
|
||||
levels_ = GET_PARAM(2);
|
||||
nr_plane_ = GET_PARAM(3);
|
||||
max_data_term_ = GET_PARAM(4);
|
||||
data_weight_ = GET_PARAM(5);
|
||||
max_disc_term_ = GET_PARAM(6);
|
||||
disc_single_jump_ = GET_PARAM(7);
|
||||
min_disp_th_ = GET_PARAM(8);
|
||||
msg_type_ = GET_PARAM(9);
|
||||
}
|
||||
};
|
||||
TEST_P(StereoMatchConstSpaceBP, Regression)
|
||||
{
|
||||
Mat left_image = readImage("csstereobp/aloe-L.png");
|
||||
Mat right_image = readImage("csstereobp/aloe-R.png");
|
||||
Mat disp_gold = readImage("csstereobp/aloe-disp.png", IMREAD_GRAYSCALE);
|
||||
|
||||
ocl::oclMat d_left, d_right;
|
||||
ocl::oclMat d_disp;
|
||||
|
||||
Mat disp;
|
||||
ASSERT_FALSE(left_image.empty());
|
||||
ASSERT_FALSE(right_image.empty());
|
||||
ASSERT_FALSE(disp_gold.empty());
|
||||
|
||||
d_left.upload(left_image);
|
||||
d_right.upload(right_image);
|
||||
|
||||
ocl::StereoConstantSpaceBP bp(ndisp_, iters_, levels_, nr_plane_, max_data_term_, data_weight_,
|
||||
max_disc_term_, disc_single_jump_, 0, CV_32F);
|
||||
bp(d_left, d_right, d_disp);
|
||||
d_disp.download(disp);
|
||||
disp.convertTo(disp, disp_gold.depth());
|
||||
|
||||
EXPECT_MAT_SIMILAR(disp_gold, disp, 1e-4);
|
||||
//EXPECT_MAT_NEAR(disp_gold, disp, 1.0, "");
|
||||
}
|
||||
INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchConstSpaceBP, testing::Combine(testing::Values(128),
|
||||
testing::Values(16),testing::Values(4), testing::Values(4), testing::Values(30.0f),
|
||||
testing::Values(1.0f),testing::Values(160.0f),
|
||||
testing::Values(10.0f), testing::Values(0), testing::Values(CV_32F)));
|
||||
#endif // HAVE_OPENCL
|
||||
|
@ -103,7 +103,7 @@ PARAM_TEST_CASE(CvtColor, cv::Size, MatDepth)
|
||||
cv::cvtColor(src, dst_gold, CVTCODE(name));\
|
||||
cv::Mat dst_mat;\
|
||||
dst.download(dst_mat);\
|
||||
EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5, "");\
|
||||
EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5);\
|
||||
}
|
||||
|
||||
//add new ones here using macro
|
||||
@ -144,7 +144,7 @@ TEST_P(CvtColor_Gray2RGB, Accuracy)
|
||||
cv::cvtColor(src, dst_gold, code);
|
||||
cv::Mat dst_mat;
|
||||
dst.download(dst_mat);
|
||||
EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5, "");
|
||||
EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5);
|
||||
}
|
||||
|
||||
|
||||
@ -174,7 +174,7 @@ TEST_P(CvtColor_YUV420, Accuracy)
|
||||
cv::Mat dst_mat;
|
||||
dst.download(dst_mat);
|
||||
MAT_DIFF(dst_mat, dst_gold);
|
||||
EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5, "");
|
||||
EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(OCL_ImgProc, CvtColor, testing::Combine(
|
||||
|
@ -47,27 +47,16 @@
|
||||
#include "precomp.hpp"
|
||||
#include <iomanip>
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
/// ColumnSum
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// ColumnSum
|
||||
|
||||
PARAM_TEST_CASE(ColumnSum, cv::Size, bool )
|
||||
PARAM_TEST_CASE(ColumnSum, cv::Size)
|
||||
{
|
||||
cv::Size size;
|
||||
cv::Mat src;
|
||||
bool useRoi;
|
||||
//std::vector<cv::ocl::Info> oclinfo;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
size = GET_PARAM(0);
|
||||
useRoi = GET_PARAM(1);
|
||||
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
|
||||
//CV_Assert(devnums > 0);
|
||||
}
|
||||
};
|
||||
|
||||
@ -99,8 +88,7 @@ TEST_P(ColumnSum, Accuracy)
|
||||
}
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ColumnSum, testing::Combine(
|
||||
DIFFERENT_SIZES, testing::Values(Inverse(false), Inverse(true))));
|
||||
INSTANTIATE_TEST_CASE_P(OCL_ImgProc, ColumnSum, DIFFERENT_SIZES);
|
||||
|
||||
|
||||
#endif
|
||||
|
@ -68,7 +68,7 @@ TEST_P(Dft, C2C)
|
||||
|
||||
cv::dft(a, b_gold, dft_flags);
|
||||
cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), dft_flags);
|
||||
EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), a.size().area() * 1e-4, "");
|
||||
EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), a.size().area() * 1e-4);
|
||||
}
|
||||
|
||||
TEST_P(Dft, R2C)
|
||||
@ -81,11 +81,11 @@ TEST_P(Dft, R2C)
|
||||
cv::dft(a, b_gold, cv::DFT_COMPLEX_OUTPUT | dft_flags);
|
||||
|
||||
b_gold_roi = b_gold(cv::Rect(0, 0, d_b.cols, d_b.rows));
|
||||
EXPECT_MAT_NEAR(b_gold_roi, cv::Mat(d_b), a.size().area() * 1e-4, "");
|
||||
EXPECT_MAT_NEAR(b_gold_roi, cv::Mat(d_b), a.size().area() * 1e-4);
|
||||
|
||||
cv::Mat c_gold;
|
||||
cv::dft(b_gold, c_gold, cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
|
||||
EXPECT_MAT_NEAR(b_gold_roi, cv::Mat(d_b), a.size().area() * 1e-4, "");
|
||||
EXPECT_MAT_NEAR(b_gold_roi, cv::Mat(d_b), a.size().area() * 1e-4);
|
||||
}
|
||||
|
||||
TEST_P(Dft, R2CthenC2R)
|
||||
@ -95,7 +95,7 @@ TEST_P(Dft, R2CthenC2R)
|
||||
cv::ocl::oclMat d_b, d_c;
|
||||
cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), 0);
|
||||
cv::ocl::dft(d_b, d_c, a.size(), cv::DFT_SCALE | cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT);
|
||||
EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4, "");
|
||||
EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4);
|
||||
}
|
||||
|
||||
|
||||
|
@ -12,6 +12,7 @@
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
@ -19,6 +20,7 @@
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Zero Lin, Zero.Lin@amd.com
|
||||
// Zhang Ying, zhangying913@gmail.com
|
||||
// Yao Wang, bitwangyaoyao@gmail.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -55,121 +57,13 @@ using namespace testing;
|
||||
using namespace std;
|
||||
|
||||
|
||||
PARAM_TEST_CASE(FilterTestBase, MatType, bool)
|
||||
PARAM_TEST_CASE(FilterTestBase,
|
||||
MatType,
|
||||
cv::Size, // kernel size
|
||||
cv::Size, // dx,dy
|
||||
int // border type, or iteration
|
||||
)
|
||||
{
|
||||
int type;
|
||||
cv::Scalar val;
|
||||
|
||||
//src mat
|
||||
cv::Mat mat1;
|
||||
cv::Mat mat2;
|
||||
cv::Mat mask;
|
||||
cv::Mat dst;
|
||||
cv::Mat dst1; //bak, for two outputs
|
||||
|
||||
// set up roi
|
||||
int roicols;
|
||||
int roirows;
|
||||
int src1x;
|
||||
int src1y;
|
||||
int src2x;
|
||||
int src2y;
|
||||
int dstx;
|
||||
int dsty;
|
||||
int maskx;
|
||||
int masky;
|
||||
|
||||
//src mat with roi
|
||||
cv::Mat mat1_roi;
|
||||
cv::Mat mat2_roi;
|
||||
cv::Mat mask_roi;
|
||||
cv::Mat dst_roi;
|
||||
cv::Mat dst1_roi; //bak
|
||||
//std::vector<cv::ocl::Info> oclinfo;
|
||||
//ocl dst mat for testing
|
||||
cv::ocl::oclMat gdst_whole;
|
||||
cv::ocl::oclMat gdst1_whole; //bak
|
||||
|
||||
//ocl mat with roi
|
||||
cv::ocl::oclMat gmat1;
|
||||
cv::ocl::oclMat gmat2;
|
||||
cv::ocl::oclMat gdst;
|
||||
cv::ocl::oclMat gdst1; //bak
|
||||
cv::ocl::oclMat gmask;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
type = GET_PARAM(0);
|
||||
|
||||
cv::RNG &rng = TS::ptr()->get_rng();
|
||||
cv::Size size(MWIDTH, MHEIGHT);
|
||||
|
||||
mat1 = randomMat(rng, size, type, 5, 16, false);
|
||||
mat2 = randomMat(rng, size, type, 5, 16, false);
|
||||
dst = randomMat(rng, size, type, 5, 16, false);
|
||||
dst1 = randomMat(rng, size, type, 5, 16, false);
|
||||
mask = randomMat(rng, size, CV_8UC1, 0, 2, false);
|
||||
|
||||
cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
|
||||
|
||||
val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
|
||||
}
|
||||
|
||||
void random_roi()
|
||||
{
|
||||
#ifdef RANDOMROI
|
||||
//randomize ROI
|
||||
cv::RNG &rng = TS::ptr()->get_rng();
|
||||
roicols = rng.uniform(1, mat1.cols);
|
||||
roirows = rng.uniform(1, mat1.rows);
|
||||
src1x = rng.uniform(0, mat1.cols - roicols);
|
||||
src1y = rng.uniform(0, mat1.rows - roirows);
|
||||
src2x = rng.uniform(0, mat2.cols - roicols);
|
||||
src2y = rng.uniform(0, mat2.rows - roirows);
|
||||
dstx = rng.uniform(0, dst.cols - roicols);
|
||||
dsty = rng.uniform(0, dst.rows - roirows);
|
||||
maskx = rng.uniform(0, mask.cols - roicols);
|
||||
masky = rng.uniform(0, mask.rows - roirows);
|
||||
#else
|
||||
roicols = mat1.cols;
|
||||
roirows = mat1.rows;
|
||||
src1x = 0;
|
||||
src1y = 0;
|
||||
src2x = 0;
|
||||
src2y = 0;
|
||||
dstx = 0;
|
||||
dsty = 0;
|
||||
maskx = 0;
|
||||
masky = 0;
|
||||
#endif
|
||||
mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
|
||||
mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
|
||||
mask_roi = mask(Rect(maskx, masky, roicols, roirows));
|
||||
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
|
||||
dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows));
|
||||
|
||||
gdst_whole = dst;
|
||||
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
|
||||
|
||||
gdst1_whole = dst1;
|
||||
gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
|
||||
|
||||
gmat1 = mat1_roi;
|
||||
gmat2 = mat2_roi;
|
||||
gmask = mask_roi;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// blur
|
||||
|
||||
PARAM_TEST_CASE(Blur, MatType, cv::Size, int)
|
||||
{
|
||||
int type;
|
||||
cv::Size ksize;
|
||||
int bordertype;
|
||||
|
||||
//src mat
|
||||
cv::Mat mat1;
|
||||
cv::Mat dst;
|
||||
@ -185,7 +79,7 @@ PARAM_TEST_CASE(Blur, MatType, cv::Size, int)
|
||||
//src mat with roi
|
||||
cv::Mat mat1_roi;
|
||||
cv::Mat dst_roi;
|
||||
//std::vector<cv::ocl::Info> oclinfo;
|
||||
|
||||
//ocl dst mat for testing
|
||||
cv::ocl::oclMat gdst_whole;
|
||||
|
||||
@ -193,23 +87,6 @@ PARAM_TEST_CASE(Blur, MatType, cv::Size, int)
|
||||
cv::ocl::oclMat gmat1;
|
||||
cv::ocl::oclMat gdst;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
type = GET_PARAM(0);
|
||||
ksize = GET_PARAM(1);
|
||||
bordertype = GET_PARAM(2);
|
||||
|
||||
cv::RNG &rng = TS::ptr()->get_rng();
|
||||
cv::Size size(MWIDTH, MHEIGHT);
|
||||
|
||||
mat1 = randomMat(rng, size, type, 5, 16, false);
|
||||
dst = randomMat(rng, size, type, 5, 16, false);
|
||||
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
|
||||
//CV_Assert(devnums > 0);
|
||||
////if you want to use undefault device, set it here
|
||||
////setDevice(oclinfo[0]);
|
||||
}
|
||||
|
||||
void random_roi()
|
||||
{
|
||||
#ifdef RANDOMROI
|
||||
@ -236,10 +113,37 @@ PARAM_TEST_CASE(Blur, MatType, cv::Size, int)
|
||||
gdst_whole = dst;
|
||||
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
|
||||
|
||||
|
||||
gmat1 = mat1_roi;
|
||||
}
|
||||
|
||||
void Init(int mat_type)
|
||||
{
|
||||
cv::Size size(MWIDTH, MHEIGHT);
|
||||
mat1 = randomMat(size, mat_type, 5, 16);
|
||||
dst = randomMat(size, mat_type, 5, 16);
|
||||
}
|
||||
|
||||
void Near(double threshold)
|
||||
{
|
||||
EXPECT_MAT_NEAR(dst, Mat(gdst_whole), threshold);
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// blur
|
||||
struct Blur : FilterTestBase
|
||||
{
|
||||
int type;
|
||||
cv::Size ksize;
|
||||
int bordertype;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
type = GET_PARAM(0);
|
||||
ksize = GET_PARAM(1);
|
||||
bordertype = GET_PARAM(3);
|
||||
Init(type);
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(Blur, Mat)
|
||||
@ -247,116 +151,36 @@ TEST_P(Blur, Mat)
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
|
||||
cv::blur(mat1_roi, dst_roi, ksize, Point(-1, -1), bordertype);
|
||||
cv::ocl::blur(gmat1, gdst, ksize, Point(-1, -1), bordertype);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty);
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1.0, sss);
|
||||
Near(1.0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//Laplacian
|
||||
|
||||
PARAM_TEST_CASE(LaplacianTestBase, MatType, int)
|
||||
struct Laplacian : FilterTestBase
|
||||
{
|
||||
int type;
|
||||
int ksize;
|
||||
|
||||
//src mat
|
||||
cv::Mat mat;
|
||||
cv::Mat dst;
|
||||
|
||||
// set up roi
|
||||
int roicols;
|
||||
int roirows;
|
||||
int srcx;
|
||||
int srcy;
|
||||
int dstx;
|
||||
int dsty;
|
||||
|
||||
//src mat with roi
|
||||
cv::Mat mat_roi;
|
||||
cv::Mat dst_roi;
|
||||
//std::vector<cv::ocl::Info> oclinfo;
|
||||
//ocl dst mat for testing
|
||||
cv::ocl::oclMat gdst_whole;
|
||||
|
||||
//ocl mat with roi
|
||||
cv::ocl::oclMat gmat;
|
||||
cv::ocl::oclMat gdst;
|
||||
cv::Size ksize;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
type = GET_PARAM(0);
|
||||
ksize = GET_PARAM(1);
|
||||
|
||||
cv::RNG &rng = TS::ptr()->get_rng();
|
||||
cv::Size size(MWIDTH, MHEIGHT);
|
||||
|
||||
mat = randomMat(rng, size, type, 5, 16, false);
|
||||
dst = randomMat(rng, size, type, 5, 16, false);
|
||||
|
||||
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
|
||||
//CV_Assert(devnums > 0);
|
||||
////if you want to use undefault device, set it here
|
||||
////setDevice(oclinfo[0]);
|
||||
}
|
||||
|
||||
void random_roi()
|
||||
{
|
||||
#ifdef RANDOMROI
|
||||
//randomize ROI
|
||||
cv::RNG &rng = TS::ptr()->get_rng();
|
||||
roicols = rng.uniform(2, mat.cols);
|
||||
roirows = rng.uniform(2, mat.rows);
|
||||
srcx = rng.uniform(0, mat.cols - roicols);
|
||||
srcy = rng.uniform(0, mat.rows - roirows);
|
||||
dstx = rng.uniform(0, dst.cols - roicols);
|
||||
dsty = rng.uniform(0, dst.rows - roirows);
|
||||
#else
|
||||
roicols = mat.cols;
|
||||
roirows = mat.rows;
|
||||
srcx = 0;
|
||||
srcy = 0;
|
||||
dstx = 0;
|
||||
dsty = 0;
|
||||
#endif
|
||||
|
||||
mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
|
||||
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
|
||||
|
||||
gdst_whole = dst;
|
||||
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
|
||||
|
||||
gmat = mat_roi;
|
||||
Init(type);
|
||||
}
|
||||
};
|
||||
|
||||
struct Laplacian : LaplacianTestBase {};
|
||||
|
||||
TEST_P(Laplacian, Accuracy)
|
||||
{
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
|
||||
cv::Laplacian(mat_roi, dst_roi, -1, ksize, 1);
|
||||
cv::ocl::Laplacian(gmat, gdst, -1, ksize, 1);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, srcx, srcy, dstx, dsty);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
cv::Laplacian(mat1_roi, dst_roi, -1, ksize.width, 1);
|
||||
cv::ocl::Laplacian(gmat1, gdst, -1, ksize.width, 1);
|
||||
Near(1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
@ -364,8 +188,7 @@ TEST_P(Laplacian, Accuracy)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// erode & dilate
|
||||
|
||||
PARAM_TEST_CASE(ErodeDilateBase, MatType, int)
|
||||
struct ErodeDilate : FilterTestBase
|
||||
{
|
||||
int type;
|
||||
int iterations;
|
||||
@ -373,210 +196,54 @@ PARAM_TEST_CASE(ErodeDilateBase, MatType, int)
|
||||
//erode or dilate kernel
|
||||
cv::Mat kernel;
|
||||
|
||||
//src mat
|
||||
cv::Mat mat1;
|
||||
cv::Mat dst;
|
||||
|
||||
// set up roi
|
||||
int roicols;
|
||||
int roirows;
|
||||
int src1x;
|
||||
int src1y;
|
||||
int dstx;
|
||||
int dsty;
|
||||
|
||||
//src mat with roi
|
||||
cv::Mat mat1_roi;
|
||||
cv::Mat dst_roi;
|
||||
//std::vector<cv::ocl::Info> oclinfo;
|
||||
//ocl dst mat for testing
|
||||
cv::ocl::oclMat gdst_whole;
|
||||
|
||||
//ocl mat with roi
|
||||
cv::ocl::oclMat gmat1;
|
||||
cv::ocl::oclMat gdst;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
type = GET_PARAM(0);
|
||||
iterations = GET_PARAM(1);
|
||||
|
||||
cv::RNG &rng = TS::ptr()->get_rng();
|
||||
cv::Size size(MWIDTH, MHEIGHT);
|
||||
|
||||
mat1 = randomMat(rng, size, type, 5, 16, false);
|
||||
dst = randomMat(rng, size, type, 5, 16, false);
|
||||
iterations = GET_PARAM(3);
|
||||
Init(type);
|
||||
// rng.fill(kernel, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(3));
|
||||
kernel = randomMat(rng, Size(3, 3), CV_8UC1, 0, 3, false);
|
||||
|
||||
}
|
||||
|
||||
void random_roi()
|
||||
{
|
||||
#ifdef RANDOMROI
|
||||
//randomize ROI
|
||||
cv::RNG &rng = TS::ptr()->get_rng();
|
||||
roicols = rng.uniform(2, mat1.cols);
|
||||
roirows = rng.uniform(2, mat1.rows);
|
||||
src1x = rng.uniform(0, mat1.cols - roicols);
|
||||
src1y = rng.uniform(0, mat1.rows - roirows);
|
||||
dstx = rng.uniform(0, dst.cols - roicols);
|
||||
dsty = rng.uniform(0, dst.rows - roirows);
|
||||
#else
|
||||
roicols = mat1.cols;
|
||||
roirows = mat1.rows;
|
||||
src1x = 0;
|
||||
src1y = 0;
|
||||
dstx = 0;
|
||||
dsty = 0;
|
||||
#endif
|
||||
|
||||
mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
|
||||
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
|
||||
|
||||
gdst_whole = dst;
|
||||
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
|
||||
|
||||
|
||||
gmat1 = mat1_roi;
|
||||
kernel = randomMat(Size(3, 3), CV_8UC1, 0, 3);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// erode
|
||||
|
||||
struct Erode : ErodeDilateBase {};
|
||||
|
||||
TEST_P(Erode, Mat)
|
||||
{
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
|
||||
cv::erode(mat1_roi, dst_roi, kernel, Point(-1, -1), iterations);
|
||||
cv::ocl::erode(gmat1, gdst, kernel, Point(-1, -1), iterations);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// dilate
|
||||
|
||||
struct Dilate : ErodeDilateBase {};
|
||||
|
||||
TEST_P(Dilate, Mat)
|
||||
TEST_P(ErodeDilate, Mat)
|
||||
{
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
cv::erode(mat1_roi, dst_roi, kernel, Point(-1, -1), iterations);
|
||||
cv::ocl::erode(gmat1, gdst, kernel, Point(-1, -1), iterations);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
|
||||
Near(1e-5);
|
||||
}
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
cv::dilate(mat1_roi, dst_roi, kernel, Point(-1, -1), iterations);
|
||||
cv::ocl::dilate(gmat1, gdst, kernel, Point(-1, -1), iterations);
|
||||
Near(1e-5);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Sobel
|
||||
|
||||
PARAM_TEST_CASE(Sobel, MatType, int, int, int, int)
|
||||
struct Sobel : FilterTestBase
|
||||
{
|
||||
int type;
|
||||
int dx, dy, ksize, bordertype;
|
||||
|
||||
//src mat
|
||||
cv::Mat mat1;
|
||||
cv::Mat dst;
|
||||
|
||||
// set up roi
|
||||
int roicols;
|
||||
int roirows;
|
||||
int src1x;
|
||||
int src1y;
|
||||
int dstx;
|
||||
int dsty;
|
||||
|
||||
//src mat with roi
|
||||
cv::Mat mat1_roi;
|
||||
cv::Mat dst_roi;
|
||||
//std::vector<cv::ocl::Info> oclinfo;
|
||||
//ocl dst mat for testing
|
||||
cv::ocl::oclMat gdst_whole;
|
||||
|
||||
//ocl mat with roi
|
||||
cv::ocl::oclMat gmat1;
|
||||
cv::ocl::oclMat gdst;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
type = GET_PARAM(0);
|
||||
dx = GET_PARAM(1);
|
||||
dy = GET_PARAM(2);
|
||||
ksize = GET_PARAM(3);
|
||||
bordertype = GET_PARAM(4);
|
||||
|
||||
|
||||
cv::RNG &rng = TS::ptr()->get_rng();
|
||||
cv::Size size(MWIDTH, MHEIGHT);
|
||||
|
||||
mat1 = randomMat(rng, size, type, 5, 16, false);
|
||||
dst = randomMat(rng, size, type, 5, 16, false);
|
||||
|
||||
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
|
||||
//CV_Assert(devnums > 0);
|
||||
////if you want to use undefault device, set it here
|
||||
////setDevice(oclinfo[0]);
|
||||
Size s = GET_PARAM(1);
|
||||
ksize = s.width;
|
||||
s = GET_PARAM(2);
|
||||
dx = s.width;
|
||||
dy = s.height;
|
||||
bordertype = GET_PARAM(3);
|
||||
Init(type);
|
||||
}
|
||||
|
||||
void random_roi()
|
||||
{
|
||||
#ifdef RANDOMROI
|
||||
//randomize ROI
|
||||
cv::RNG &rng = TS::ptr()->get_rng();
|
||||
roicols = rng.uniform(2, mat1.cols);
|
||||
roirows = rng.uniform(2, mat1.rows);
|
||||
src1x = rng.uniform(0, mat1.cols - roicols);
|
||||
src1y = rng.uniform(0, mat1.rows - roirows);
|
||||
dstx = rng.uniform(0, dst.cols - roicols);
|
||||
dsty = rng.uniform(0, dst.rows - roirows);
|
||||
#else
|
||||
roicols = mat1.cols;
|
||||
roirows = mat1.rows;
|
||||
src1x = 0;
|
||||
src1y = 0;
|
||||
dstx = 0;
|
||||
dsty = 0;
|
||||
#endif
|
||||
|
||||
mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
|
||||
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
|
||||
|
||||
gdst_whole = dst;
|
||||
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
|
||||
|
||||
|
||||
gmat1 = mat1_roi;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
TEST_P(Sobel, Mat)
|
||||
@ -584,103 +251,29 @@ TEST_P(Sobel, Mat)
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
|
||||
cv::Sobel(mat1_roi, dst_roi, -1, dx, dy, ksize, /*scale*/0.00001,/*delta*/0, bordertype);
|
||||
cv::ocl::Sobel(gmat1, gdst, -1, dx, dy, ksize,/*scale*/0.00001,/*delta*/0, bordertype);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss);
|
||||
Near(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Scharr
|
||||
|
||||
PARAM_TEST_CASE(Scharr, MatType, int, int, int)
|
||||
struct Scharr : FilterTestBase
|
||||
{
|
||||
int type;
|
||||
int dx, dy, bordertype;
|
||||
|
||||
//src mat
|
||||
cv::Mat mat1;
|
||||
cv::Mat dst;
|
||||
|
||||
// set up roi
|
||||
int roicols;
|
||||
int roirows;
|
||||
int src1x;
|
||||
int src1y;
|
||||
int dstx;
|
||||
int dsty;
|
||||
|
||||
//src mat with roi
|
||||
cv::Mat mat1_roi;
|
||||
cv::Mat dst_roi;
|
||||
//std::vector<cv::ocl::Info> oclinfo;
|
||||
//ocl dst mat for testing
|
||||
cv::ocl::oclMat gdst_whole;
|
||||
|
||||
//ocl mat with roi
|
||||
cv::ocl::oclMat gmat1;
|
||||
cv::ocl::oclMat gdst;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
type = GET_PARAM(0);
|
||||
dx = GET_PARAM(1);
|
||||
dy = GET_PARAM(2);
|
||||
Size s = GET_PARAM(2);
|
||||
dx = s.width;
|
||||
dy = s.height;
|
||||
bordertype = GET_PARAM(3);
|
||||
dx = 1;
|
||||
dy = 0;
|
||||
|
||||
cv::RNG &rng = TS::ptr()->get_rng();
|
||||
cv::Size size(MWIDTH, MHEIGHT);
|
||||
|
||||
mat1 = randomMat(rng, size, type, 5, 16, false);
|
||||
dst = randomMat(rng, size, type, 5, 16, false);
|
||||
|
||||
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
|
||||
//CV_Assert(devnums > 0);
|
||||
////if you want to use undefault device, set it here
|
||||
////setDevice(oclinfo[0]);
|
||||
Init(type);
|
||||
}
|
||||
|
||||
void random_roi()
|
||||
{
|
||||
#ifdef RANDOMROI
|
||||
//randomize ROI
|
||||
cv::RNG &rng = TS::ptr()->get_rng();
|
||||
roicols = rng.uniform(2, mat1.cols);
|
||||
roirows = rng.uniform(2, mat1.rows);
|
||||
src1x = rng.uniform(0, mat1.cols - roicols);
|
||||
src1y = rng.uniform(0, mat1.rows - roirows);
|
||||
dstx = rng.uniform(0, dst.cols - roicols);
|
||||
dsty = rng.uniform(0, dst.rows - roirows);
|
||||
#else
|
||||
roicols = mat1.cols;
|
||||
roirows = mat1.rows;
|
||||
src1x = 0;
|
||||
src1y = 0;
|
||||
dstx = 0;
|
||||
dsty = 0;
|
||||
#endif
|
||||
|
||||
mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
|
||||
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
|
||||
|
||||
gdst_whole = dst;
|
||||
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
|
||||
|
||||
|
||||
gmat1 = mat1_roi;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
TEST_P(Scharr, Mat)
|
||||
@ -688,16 +281,9 @@ TEST_P(Scharr, Mat)
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
|
||||
cv::Scharr(mat1_roi, dst_roi, -1, dx, dy, /*scale*/1,/*delta*/0, bordertype);
|
||||
cv::ocl::Scharr(gmat1, gdst, -1, dx, dy,/*scale*/1,/*delta*/0, bordertype);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss);
|
||||
Near(1);
|
||||
}
|
||||
|
||||
}
|
||||
@ -705,89 +291,23 @@ TEST_P(Scharr, Mat)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// GaussianBlur
|
||||
|
||||
PARAM_TEST_CASE(GaussianBlur, MatType, cv::Size, int)
|
||||
struct GaussianBlur : FilterTestBase
|
||||
{
|
||||
int type;
|
||||
cv::Size ksize;
|
||||
int bordertype;
|
||||
|
||||
double sigma1, sigma2;
|
||||
|
||||
//src mat
|
||||
cv::Mat mat1;
|
||||
cv::Mat dst;
|
||||
|
||||
// set up roi
|
||||
int roicols;
|
||||
int roirows;
|
||||
int src1x;
|
||||
int src1y;
|
||||
int dstx;
|
||||
int dsty;
|
||||
|
||||
//src mat with roi
|
||||
cv::Mat mat1_roi;
|
||||
cv::Mat dst_roi;
|
||||
//std::vector<cv::ocl::Info> oclinfo;
|
||||
//ocl dst mat for testing
|
||||
cv::ocl::oclMat gdst_whole;
|
||||
|
||||
//ocl mat with roi
|
||||
cv::ocl::oclMat gmat1;
|
||||
cv::ocl::oclMat gdst;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
type = GET_PARAM(0);
|
||||
ksize = GET_PARAM(1);
|
||||
bordertype = GET_PARAM(2);
|
||||
|
||||
bordertype = GET_PARAM(3);
|
||||
Init(type);
|
||||
cv::RNG &rng = TS::ptr()->get_rng();
|
||||
cv::Size size(MWIDTH, MHEIGHT);
|
||||
|
||||
sigma1 = rng.uniform(0.1, 1.0);
|
||||
sigma2 = rng.uniform(0.1, 1.0);
|
||||
|
||||
mat1 = randomMat(rng, size, type, 5, 16, false);
|
||||
dst = randomMat(rng, size, type, 5, 16, false);
|
||||
|
||||
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
|
||||
//CV_Assert(devnums > 0);
|
||||
////if you want to use undefault device, set it here
|
||||
////setDevice(oclinfo[0]);
|
||||
}
|
||||
|
||||
void random_roi()
|
||||
{
|
||||
#ifdef RANDOMROI
|
||||
//randomize ROI
|
||||
cv::RNG &rng = TS::ptr()->get_rng();
|
||||
roicols = rng.uniform(2, mat1.cols);
|
||||
roirows = rng.uniform(2, mat1.rows);
|
||||
src1x = rng.uniform(0, mat1.cols - roicols);
|
||||
src1y = rng.uniform(0, mat1.rows - roirows);
|
||||
dstx = rng.uniform(0, dst.cols - roicols);
|
||||
dsty = rng.uniform(0, dst.rows - roirows);
|
||||
#else
|
||||
roicols = mat1.cols;
|
||||
roirows = mat1.rows;
|
||||
src1x = 0;
|
||||
src1y = 0;
|
||||
dstx = 0;
|
||||
dsty = 0;
|
||||
#endif
|
||||
|
||||
mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
|
||||
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
|
||||
|
||||
gdst_whole = dst;
|
||||
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
|
||||
|
||||
|
||||
gmat1 = mat1_roi;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
TEST_P(GaussianBlur, Mat)
|
||||
@ -795,53 +315,53 @@ TEST_P(GaussianBlur, Mat)
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
|
||||
cv::GaussianBlur(mat1_roi, dst_roi, ksize, sigma1, sigma2, bordertype);
|
||||
cv::ocl::GaussianBlur(gmat1, gdst, ksize, sigma1, sigma2, bordertype);
|
||||
|
||||
cv::Mat cpu_dst;
|
||||
gdst_whole.download(cpu_dst);
|
||||
char sss[1024];
|
||||
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, cpu_dst, 1.0, sss);
|
||||
Near(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
|
||||
Values(cv::Size(3, 3), cv::Size(5, 5), cv::Size(7, 7)),
|
||||
Values(Size(0, 0)), //not use
|
||||
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT, (MatType)cv::BORDER_REFLECT_101)));
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filters, Laplacian, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||
Values(1, 3)));
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||
Values(Size(3, 3)),
|
||||
Values(Size(0, 0)), //not use
|
||||
Values(0))); //not use
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1)));
|
||||
|
||||
//INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(Values(CV_8UC1, CV_8UC1), Values(false)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1)));
|
||||
|
||||
//INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(Values(CV_8UC1, CV_8UC1), Values(false)));
|
||||
INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine(
|
||||
Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
|
||||
Values(Size(0, 0)), //not use
|
||||
Values(Size(0, 0)), //not use
|
||||
Values(1)));
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine(Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||
Values(1, 2), Values(0, 1), Values(3, 5), Values((MatType)cv::BORDER_CONSTANT,
|
||||
(MatType)cv::BORDER_REPLICATE)));
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||
Values(Size(3, 3), Size(5, 5)),
|
||||
Values(Size(1, 0), Size(1, 1), Size(2, 0), Size(2, 1)),
|
||||
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE)));
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Scharr, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4), Values(0, 1), Values(0, 1),
|
||||
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE)));
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
|
||||
Values(Size(0, 0)), //not use
|
||||
Values(Size(0, 1), Size(1, 0)),
|
||||
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
|
||||
Values(cv::Size(3, 3), cv::Size(5, 5)),
|
||||
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE)));
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
|
||||
Values(Size(3, 3), Size(5, 5)),
|
||||
Values(Size(0, 0)), //not use
|
||||
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE)));
|
||||
|
||||
|
||||
|
||||
|
@ -53,13 +53,12 @@ PARAM_TEST_CASE(Gemm, int, cv::Size, int)
|
||||
int type;
|
||||
cv::Size mat_size;
|
||||
int flags;
|
||||
//vector<cv::ocl::Info> info;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
type = GET_PARAM(0);
|
||||
mat_size = GET_PARAM(1);
|
||||
flags = GET_PARAM(2);
|
||||
//cv::ocl::getDevice(info);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -12,10 +12,12 @@
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Sen Liu, swjutls1987@126.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -61,40 +63,31 @@ struct getRect
|
||||
}
|
||||
};
|
||||
|
||||
PARAM_TEST_CASE(HaarTestBase, int, int)
|
||||
PARAM_TEST_CASE(Haar, double, int)
|
||||
{
|
||||
//std::vector<cv::ocl::Info> oclinfo;
|
||||
cv::ocl::OclCascadeClassifier cascade, nestedCascade;
|
||||
cv::ocl::OclCascadeClassifierBuf cascadebuf;
|
||||
cv::CascadeClassifier cpucascade, cpunestedCascade;
|
||||
// Mat img;
|
||||
|
||||
double scale;
|
||||
int index;
|
||||
int flags;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
scale = 1.0;
|
||||
index = 0;
|
||||
scale = GET_PARAM(0);
|
||||
flags = GET_PARAM(1);
|
||||
string cascadeName = workdir + "../../data/haarcascades/haarcascade_frontalface_alt.xml";
|
||||
|
||||
if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)))
|
||||
if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) || (!cascadebuf.load( cascadeName )))
|
||||
{
|
||||
cout << "ERROR: Could not load classifier cascade" << endl;
|
||||
return;
|
||||
}
|
||||
//int devnums = getDevice(oclinfo);
|
||||
//CV_Assert(devnums>0);
|
||||
////if you want to use undefault device, set it here
|
||||
////setDevice(oclinfo[0]);
|
||||
//cv::ocl::setBinpath("E:\\");
|
||||
}
|
||||
};
|
||||
|
||||
////////////////////////////////faceDetect/////////////////////////////////////////////////
|
||||
|
||||
struct Haar : HaarTestBase {};
|
||||
|
||||
TEST_F(Haar, FaceDetect)
|
||||
TEST_P(Haar, FaceDetect)
|
||||
{
|
||||
string imgName = workdir + "lena.jpg";
|
||||
Mat img = imread( imgName, 1 );
|
||||
@ -105,59 +98,65 @@ TEST_F(Haar, FaceDetect)
|
||||
return ;
|
||||
}
|
||||
|
||||
//int i = 0;
|
||||
//double t = 0;
|
||||
vector<Rect> faces, oclfaces;
|
||||
|
||||
// const static Scalar colors[] = { CV_RGB(0, 0, 255),
|
||||
// CV_RGB(0, 128, 255),
|
||||
// CV_RGB(0, 255, 255),
|
||||
// CV_RGB(0, 255, 0),
|
||||
// CV_RGB(255, 128, 0),
|
||||
// CV_RGB(255, 255, 0),
|
||||
// CV_RGB(255, 0, 0),
|
||||
// CV_RGB(255, 0, 255)
|
||||
// } ;
|
||||
|
||||
Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
|
||||
MemStorage storage(cvCreateMemStorage(0));
|
||||
cvtColor( img, gray, COLOR_BGR2GRAY );
|
||||
resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
|
||||
equalizeHist( smallImg, smallImg );
|
||||
|
||||
|
||||
cv::ocl::oclMat image;
|
||||
CvSeq *_objects;
|
||||
image.upload(smallImg);
|
||||
_objects = cascade.oclHaarDetectObjects( image, storage, 1.1,
|
||||
3, 0
|
||||
| CV_HAAR_SCALE_IMAGE
|
||||
, Size(30, 30), Size(0, 0) );
|
||||
3, flags, Size(30, 30), Size(0, 0) );
|
||||
vector<CvAvgComp> vecAvgComp;
|
||||
Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
|
||||
oclfaces.resize(vecAvgComp.size());
|
||||
std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
|
||||
|
||||
cpucascade.detectMultiScale( smallImg, faces, 1.1,
|
||||
3, 0
|
||||
| CV_HAAR_SCALE_IMAGE
|
||||
, Size(30, 30), Size(0, 0) );
|
||||
cpucascade.detectMultiScale( smallImg, faces, 1.1, 3,
|
||||
flags,
|
||||
Size(30, 30), Size(0, 0) );
|
||||
EXPECT_EQ(faces.size(), oclfaces.size());
|
||||
/* for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
|
||||
{
|
||||
Mat smallImgROI;
|
||||
Point center;
|
||||
Scalar color = colors[i%8];
|
||||
int radius;
|
||||
center.x = cvRound((r->x + r->width*0.5)*scale);
|
||||
center.y = cvRound((r->y + r->height*0.5)*scale);
|
||||
radius = cvRound((r->width + r->height)*0.25*scale);
|
||||
circle( img, center, radius, color, 3, 8, 0 );
|
||||
} */
|
||||
//namedWindow("result");
|
||||
//imshow("result",img);
|
||||
//waitKey(0);
|
||||
//destroyAllWindows();
|
||||
|
||||
}
|
||||
|
||||
TEST_P(Haar, FaceDetectUseBuf)
|
||||
{
|
||||
string imgName = workdir + "lena.jpg";
|
||||
Mat img = imread( imgName, 1 );
|
||||
|
||||
if(img.empty())
|
||||
{
|
||||
std::cout << "Couldn't read " << imgName << std::endl;
|
||||
return ;
|
||||
}
|
||||
|
||||
vector<Rect> faces, oclfaces;
|
||||
|
||||
Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
|
||||
MemStorage storage(cvCreateMemStorage(0));
|
||||
cvtColor( img, gray, CV_BGR2GRAY );
|
||||
resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
|
||||
equalizeHist( smallImg, smallImg );
|
||||
|
||||
cv::ocl::oclMat image;
|
||||
image.upload(smallImg);
|
||||
|
||||
cascadebuf.detectMultiScale( image, oclfaces, 1.1, 3,
|
||||
flags,
|
||||
Size(30, 30), Size(0, 0) );
|
||||
cascadebuf.release();
|
||||
|
||||
cpucascade.detectMultiScale( smallImg, faces, 1.1, 3,
|
||||
flags,
|
||||
Size(30, 30), Size(0, 0) );
|
||||
EXPECT_EQ(faces.size(), oclfaces.size());
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(FaceDetect, Haar,
|
||||
Combine(Values(1.0),
|
||||
Values(CV_HAAR_SCALE_IMAGE, 0)));
|
||||
|
||||
#endif // HAVE_OPENCL
|
||||
|
@ -240,12 +240,11 @@ TEST_P(HOG, Detect)
|
||||
}
|
||||
}
|
||||
|
||||
char s[100] = {0};
|
||||
EXPECT_MAT_NEAR(cv::Mat(d_comp), cv::Mat(comp), 3, s);
|
||||
EXPECT_MAT_NEAR(cv::Mat(d_comp), cv::Mat(comp), 3);
|
||||
}
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HOG, testing::Combine(
|
||||
INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
|
||||
testing::Values(cv::Size(64, 128), cv::Size(48, 96)),
|
||||
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user