diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so
index fb34509bd..f62e5961f 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so
index fce58145c..c0237c1d0 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so
index 2498763d0..2c235d824 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so
index 9a7047b55..487258889 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so
index 98e0d9ea0..169d97e77 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so
index fe147823e..bdd09fb06 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so
index 646ae716e..74bfdea5e 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r2.2.0.so b/3rdparty/lib/armeabi/libnative_camera_r2.2.0.so
index a0db0d199..7c061d9ee 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r2.2.0.so and b/3rdparty/lib/armeabi/libnative_camera_r2.2.0.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r2.3.3.so b/3rdparty/lib/armeabi/libnative_camera_r2.3.3.so
index 6f167cb0c..686bdfb61 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r2.3.3.so and b/3rdparty/lib/armeabi/libnative_camera_r2.3.3.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r3.0.1.so b/3rdparty/lib/armeabi/libnative_camera_r3.0.1.so
index 26d286a18..5a5c23173 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r3.0.1.so and b/3rdparty/lib/armeabi/libnative_camera_r3.0.1.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r4.0.0.so b/3rdparty/lib/armeabi/libnative_camera_r4.0.0.so
index f29707cae..694cfb80d 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r4.0.0.so and b/3rdparty/lib/armeabi/libnative_camera_r4.0.0.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r4.0.3.so b/3rdparty/lib/armeabi/libnative_camera_r4.0.3.so
index 85e0320a9..c6cc8ab5f 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r4.0.3.so and b/3rdparty/lib/armeabi/libnative_camera_r4.0.3.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so b/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so
index 35c5141b9..94966c82e 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so and b/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r4.2.0.so b/3rdparty/lib/armeabi/libnative_camera_r4.2.0.so
index 816669e37..8251510c4 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r4.2.0.so and b/3rdparty/lib/armeabi/libnative_camera_r4.2.0.so differ
diff --git a/3rdparty/lib/mips/libnative_camera_r4.0.3.so b/3rdparty/lib/mips/libnative_camera_r4.0.3.so
index 616323de9..c8c9e2c57 100755
Binary files a/3rdparty/lib/mips/libnative_camera_r4.0.3.so and b/3rdparty/lib/mips/libnative_camera_r4.0.3.so differ
diff --git a/3rdparty/lib/mips/libnative_camera_r4.1.1.so b/3rdparty/lib/mips/libnative_camera_r4.1.1.so
index 7ee4f2576..6845d715d 100755
Binary files a/3rdparty/lib/mips/libnative_camera_r4.1.1.so and b/3rdparty/lib/mips/libnative_camera_r4.1.1.so differ
diff --git a/3rdparty/lib/mips/libnative_camera_r4.2.0.so b/3rdparty/lib/mips/libnative_camera_r4.2.0.so
index 33d7745ad..b148d1621 100755
Binary files a/3rdparty/lib/mips/libnative_camera_r4.2.0.so and b/3rdparty/lib/mips/libnative_camera_r4.2.0.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r2.3.3.so b/3rdparty/lib/x86/libnative_camera_r2.3.3.so
index d40409f22..d9400638b 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r2.3.3.so and b/3rdparty/lib/x86/libnative_camera_r2.3.3.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r3.0.1.so b/3rdparty/lib/x86/libnative_camera_r3.0.1.so
index 221b833a5..cf2e9908c 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r3.0.1.so and b/3rdparty/lib/x86/libnative_camera_r3.0.1.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r4.0.3.so b/3rdparty/lib/x86/libnative_camera_r4.0.3.so
index 786d6dccb..420ec818f 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r4.0.3.so and b/3rdparty/lib/x86/libnative_camera_r4.0.3.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r4.1.1.so b/3rdparty/lib/x86/libnative_camera_r4.1.1.so
index 8ec6cb74d..5468d206c 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r4.1.1.so and b/3rdparty/lib/x86/libnative_camera_r4.1.1.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r4.2.0.so b/3rdparty/lib/x86/libnative_camera_r4.2.0.so
index 7fe74d21a..992331032 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r4.2.0.so and b/3rdparty/lib/x86/libnative_camera_r4.2.0.so differ
diff --git a/3rdparty/libjasper/CMakeLists.txt b/3rdparty/libjasper/CMakeLists.txt
index 83c0198b4..7a70a19cf 100644
--- a/3rdparty/libjasper/CMakeLists.txt
+++ b/3rdparty/libjasper/CMakeLists.txt
@@ -23,8 +23,8 @@ if(WIN32 AND NOT MINGW)
   add_definitions(-DJAS_WIN_MSVC_BUILD)
 endif(WIN32 AND NOT MINGW)
 
-ocv_warnings_disable(CMAKE_C_FLAGS -Wno-implicit-function-declaration -Wno-uninitialized
-                     -Wmissing-prototypes -Wmissing-declarations -Wunused -Wshadow
+ocv_warnings_disable(CMAKE_C_FLAGS -Wno-implicit-function-declaration -Wno-uninitialized -Wmissing-prototypes
+                     -Wno-unused-but-set-parameter -Wmissing-declarations -Wunused -Wshadow
                      -Wsign-compare -Wstrict-overflow)
 ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter) # clang
 ocv_warnings_disable(CMAKE_C_FLAGS /wd4013 /wd4018 /wd4101 /wd4244 /wd4267 /wd4715) # vs2005
@@ -49,4 +49,3 @@ endif()
 if(NOT BUILD_SHARED_LIBS)
   install(TARGETS ${JASPER_LIBRARY} ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main)
 endif()
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b90e16628..4c757d8f5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -595,12 +595,15 @@ endif()
 
 status("")
 status("  GUI: ")
-if(HAVE_QT)
+if(HAVE_QT5) + status(" QT 5.x:" HAVE_QT THEN "YES (ver ${Qt5Core_VERSION_STRING})" ELSE NO) + status(" QT OpenGL support:" HAVE_QT_OPENGL THEN "YES (${Qt5OpenGL_LIBRARIES} ${Qt5OpenGL_VERSION_STRING})" ELSE NO) +elseif(HAVE_QT) status(" QT 4.x:" HAVE_QT THEN "YES (ver ${QT_VERSION_MAJOR}.${QT_VERSION_MINOR}.${QT_VERSION_PATCH} ${QT_EDITION})" ELSE NO) status(" QT OpenGL support:" HAVE_QT_OPENGL THEN "YES (${QT_QTOPENGL_LIBRARY})" ELSE NO) else() if(DEFINED WITH_QT) - status(" QT 4.x:" NO) + status(" QT:" NO) endif() if(DEFINED WITH_WIN32UI) status(" Win32 UI:" HAVE_WIN32UI THEN YES ELSE NO) diff --git a/cmake/OpenCVDetectAndroidSDK.cmake b/cmake/OpenCVDetectAndroidSDK.cmake index bab79805d..ee4188897 100644 --- a/cmake/OpenCVDetectAndroidSDK.cmake +++ b/cmake/OpenCVDetectAndroidSDK.cmake @@ -176,7 +176,8 @@ macro(android_get_compatible_target VAR) endmacro() unset(__android_project_chain CACHE) -#add_android_project(target_name ${path} NATIVE_DEPS opencv_core LIBRARY_DEPS ${OpenCV_BINARY_DIR} SDK_TARGET 11) + +# add_android_project(target_name ${path} NATIVE_DEPS opencv_core LIBRARY_DEPS ${OpenCV_BINARY_DIR} SDK_TARGET 11) macro(add_android_project target path) # parse arguments set(android_proj_arglist NATIVE_DEPS LIBRARY_DEPS SDK_TARGET IGNORE_JAVA IGNORE_MANIFEST) @@ -212,6 +213,16 @@ macro(add_android_project target path) ocv_check_dependencies(${android_proj_NATIVE_DEPS} opencv_java) endif() + if(EXISTS "${path}/jni/Android.mk" ) + # find if native_app_glue is used + file(STRINGS "${path}/jni/Android.mk" NATIVE_APP_GLUE REGEX ".*(call import-module,android/native_app_glue)" ) + if(NATIVE_APP_GLUE) + if(ANDROID_NATIVE_API_LEVEL LESS 9 OR NOT EXISTS "${ANDROID_NDK}/sources/android/native_app_glue") + set(OCV_DEPENDENCIES_FOUND FALSE) + endif() + endif() + endif() + if(OCV_DEPENDENCIES_FOUND AND android_proj_sdk_target AND ANDROID_EXECUTABLE AND ANT_EXECUTABLE AND ANDROID_TOOLS_Pkg_Revision GREATER 13 AND EXISTS "${path}/${ANDROID_MANIFEST_FILE}") project(${target}) @@ -268,9 +279,6 @@ macro(add_android_project target path) file(STRINGS "${path}/jni/Android.mk" JNI_LIB_NAME REGEX "LOCAL_MODULE[ ]*:=[ ]*.*" ) string(REGEX REPLACE "LOCAL_MODULE[ ]*:=[ ]*([a-zA-Z_][a-zA-Z_0-9]*)[ ]*" "\\1" JNI_LIB_NAME "${JNI_LIB_NAME}") - # find using of native app glue to determine native activity - file(STRINGS "${path}/jni/Android.mk" NATIVE_APP_GLUE REGEX ".*(call import-module,android/native_app_glue)" ) - if(JNI_LIB_NAME) ocv_include_modules_recurse(${android_proj_NATIVE_DEPS}) ocv_include_directories("${path}/jni") @@ -291,9 +299,9 @@ macro(add_android_project target path) ) get_target_property(android_proj_jni_location "${JNI_LIB_NAME}" LOCATION) - if (NOT (CMAKE_BUILD_TYPE MATCHES "debug")) - add_custom_command(TARGET ${JNI_LIB_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} --strip-unneeded "${android_proj_jni_location}") - endif() + if (NOT (CMAKE_BUILD_TYPE MATCHES "debug")) + add_custom_command(TARGET ${JNI_LIB_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} --strip-unneeded "${android_proj_jni_location}") + endif() endif() endif() diff --git a/cmake/OpenCVDetectCXXCompiler.cmake b/cmake/OpenCVDetectCXXCompiler.cmake index 9b841dad8..7efcba7c1 100644 --- a/cmake/OpenCVDetectCXXCompiler.cmake +++ b/cmake/OpenCVDetectCXXCompiler.cmake @@ -101,7 +101,7 @@ endif() if(MSVC64 OR MINGW64) set(X86_64 1) -elseif(MSVC AND NOT CMAKE_CROSSCOMPILING) +elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) set(X86 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") set(X86_64 1) diff --git 
a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake index 92655228d..014066bc7 100644 --- a/cmake/OpenCVDetectOpenCL.cmake +++ b/cmake/OpenCVDetectOpenCL.cmake @@ -20,12 +20,6 @@ else(APPLE) DOC "OpenCL include directory" NO_DEFAULT_PATH) - find_path(OPENCL_INCLUDE_DIR - NAMES OpenCL/cl.h CL/cl.h - HINTS ${OPENCL_ROOT_DIR} - PATH_SUFFIXES include include/nvidia-current - DOC "OpenCL include directory") - if (X86_64) set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win64 lib/x86_64 lib/x64) elseif (X86) @@ -39,12 +33,6 @@ else(APPLE) DOC "OpenCL library" NO_DEFAULT_PATH) - find_library(OPENCL_LIBRARY - NAMES OpenCL - HINTS ${OPENCL_ROOT_DIR} - PATH_SUFFIXES ${OPENCL_POSSIBLE_LIB_SUFFIXES} - DOC "OpenCL library") - mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY) include(FindPackageHandleStandardArgs) FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPENCL DEFAULT_MSG OPENCL_LIBRARY OPENCL_INCLUDE_DIR ) diff --git a/cmake/OpenCVDetectPython.cmake b/cmake/OpenCVDetectPython.cmake index 115ce338e..d606a650a 100644 --- a/cmake/OpenCVDetectPython.cmake +++ b/cmake/OpenCVDetectPython.cmake @@ -102,18 +102,12 @@ if(PYTHON_EXECUTABLE) if(BUILD_DOCS) find_host_program(SPHINX_BUILD sphinx-build) if(SPHINX_BUILD) - if(UNIX) - execute_process(COMMAND sh -c "${SPHINX_BUILD} -_ 2>&1 | sed -ne 1p" - RESULT_VARIABLE SPHINX_PROCESS - OUTPUT_VARIABLE SPHINX_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - else() - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import sphinx; print sphinx.__version__" - RESULT_VARIABLE SPHINX_PROCESS - OUTPUT_VARIABLE SPHINX_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - endif() - if(SPHINX_PROCESS EQUAL 0) + execute_process(COMMAND "${SPHINX_BUILD}" + OUTPUT_QUIET + ERROR_VARIABLE SPHINX_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(SPHINX_OUTPUT MATCHES "^Sphinx v([0-9][^ \n]*)") + set(SPHINX_VERSION "${CMAKE_MATCH_1}") set(HAVE_SPHINX 1) message(STATUS "Found Sphinx ${SPHINX_VERSION}: ${SPHINX_BUILD}") endif() diff --git a/cmake/OpenCVFindLibsGUI.cmake b/cmake/OpenCVFindLibsGUI.cmake index 3b42f1b0b..2ea864c16 100644 --- a/cmake/OpenCVFindLibsGUI.cmake +++ b/cmake/OpenCVFindLibsGUI.cmake @@ -13,12 +13,31 @@ if(WITH_WIN32UI) endif(WITH_WIN32UI) # --- QT4 --- -ocv_clear_vars(HAVE_QT) +ocv_clear_vars(HAVE_QT HAVE_QT5) if(WITH_QT) - find_package(Qt4) - if(QT4_FOUND) - set(HAVE_QT TRUE) - add_definitions(-DHAVE_QT) # We need to define the macro this way, using cvconfig.h does not work + if(NOT CMAKE_VERSION VERSION_LESS 2.8.3 AND NOT WITH_QT EQUAL 4) + find_package(Qt5Core) + find_package(Qt5Gui) + find_package(Qt5Widgets) + find_package(Qt5Test) + find_package(Qt5Concurrent) + if(Qt5Core_FOUND AND Qt5Gui_FOUND AND Qt5Widgets_FOUND AND Qt5Test_FOUND AND Qt5Concurrent_FOUND) + set(HAVE_QT5 ON) + set(HAVE_QT ON) + add_definitions(-DHAVE_QT) + find_package(Qt5OpenGL) + if(Qt5OpenGL_FOUND) + set(QT_QTOPENGL_FOUND ON) + endif() + endif() + endif() + + if(NOT HAVE_QT) + find_package(Qt4) + if(QT4_FOUND) + set(HAVE_QT TRUE) + add_definitions(-DHAVE_QT) # We need to define the macro this way, using cvconfig.h does not work + endif() endif() endif() diff --git a/modules/androidcamera/camera_wrapper/camera_wrapper.cpp b/modules/androidcamera/camera_wrapper/camera_wrapper.cpp index f6ec2f09c..2d0ebc7a1 100644 --- a/modules/androidcamera/camera_wrapper/camera_wrapper.cpp +++ b/modules/androidcamera/camera_wrapper/camera_wrapper.cpp @@ -362,6 +362,9 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback, typedef sp (*Android23ConnectFuncType)(int); typedef sp 
(*Android3DConnectFuncType)(int, int); + const int BACK_CAMERA_INDEX = 99; + const int FRONT_CAMERA_INDEX = 98; + enum { CAMERA_SUPPORT_MODE_2D = 0x01, /* Camera Sensor supports 2D mode. */ CAMERA_SUPPORT_MODE_3D = 0x02, /* Camera Sensor supports 3D mode. */ @@ -373,7 +376,51 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback, const char Android23ConnectName[] = "_ZN7android6Camera7connectEi"; const char Android3DConnectName[] = "_ZN7android6Camera7connectEii"; - LOGD("CameraHandler::initCameraConnect(%p, %d, %p, %p)", callback, cameraId, userData, prevCameraParameters); + int localCameraIndex = cameraId; + +#if !defined(ANDROID_r2_2_0) + if (cameraId == BACK_CAMERA_INDEX) + { + LOGD("Back camera selected"); + for (int i = 0; i < Camera::getNumberOfCameras(); i++) + { + CameraInfo info; + Camera::getCameraInfo(i, &info); + if (info.facing == CAMERA_FACING_BACK) + { + localCameraIndex = i; + break; + } + } + } + else if (cameraId == FRONT_CAMERA_INDEX) + { + LOGD("Front camera selected"); + for (int i = 0; i < Camera::getNumberOfCameras(); i++) + { + CameraInfo info; + Camera::getCameraInfo(i, &info); + if (info.facing == CAMERA_FACING_FRONT) + { + localCameraIndex = i; + break; + } + } + } + + if (localCameraIndex == BACK_CAMERA_INDEX) + { + LOGE("Back camera not found!"); + return NULL; + } + else if (localCameraIndex == FRONT_CAMERA_INDEX) + { + LOGE("Front camera not found!"); + return NULL; + } +#endif + + LOGD("CameraHandler::initCameraConnect(%p, %d, %p, %p)", callback, localCameraIndex, userData, prevCameraParameters); sp camera = 0; @@ -381,8 +428,8 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback, if (!CameraHALHandle) { - LOGE("Cannot link to \"libcamera_client.so\""); - return NULL; + LOGE("Cannot link to \"libcamera_client.so\""); + return NULL; } // reset errors @@ -390,24 +437,24 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback, if (Android22ConnectFuncType Android22Connect = (Android22ConnectFuncType)dlsym(CameraHALHandle, Android22ConnectName)) { - LOGD("Connecting to CameraService v 2.2"); - camera = Android22Connect(); + LOGD("Connecting to CameraService v 2.2"); + camera = Android22Connect(); } else if (Android23ConnectFuncType Android23Connect = (Android23ConnectFuncType)dlsym(CameraHALHandle, Android23ConnectName)) { - LOGD("Connecting to CameraService v 2.3"); - camera = Android23Connect(cameraId); + LOGD("Connecting to CameraService v 2.3"); + camera = Android23Connect(localCameraIndex); } else if (Android3DConnectFuncType Android3DConnect = (Android3DConnectFuncType)dlsym(CameraHALHandle, Android3DConnectName)) { - LOGD("Connecting to CameraService v 3D"); - camera = Android3DConnect(cameraId, CAMERA_SUPPORT_MODE_2D); + LOGD("Connecting to CameraService v 3D"); + camera = Android3DConnect(localCameraIndex, CAMERA_SUPPORT_MODE_2D); } else { - dlclose(CameraHALHandle); - LOGE("Cannot connect to CameraService. Connect method was not found!"); - return NULL; + dlclose(CameraHALHandle); + LOGE("Cannot connect to CameraService. 
Connect method was not found!"); + return NULL; } dlclose(CameraHALHandle); @@ -422,7 +469,7 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback, camera->setListener(handler); handler->camera = camera; - handler->cameraId = cameraId; + handler->cameraId = localCameraIndex; if (prevCameraParameters != 0) { diff --git a/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst b/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst index 4dda9662d..f2fbfd1d9 100644 --- a/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst +++ b/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst @@ -1486,6 +1486,6 @@ The function reconstructs 3-dimensional points (in homogeneous coordinates) by u .. [SteweniusCFS] Stewénius, H., Calibrated Fivepoint solver. http://www.vis.uky.edu/~stewe/FIVEPOINT/ -.. [Slabaugh] Slabaugh, G.G. Computing Euler angles from a rotation matrix. http://gregslabaugh.name/publications/euler.pdf +.. [Slabaugh] Slabaugh, G.G. Computing Euler angles from a rotation matrix. http://www.soi.city.ac.uk/~sbbh653/publications/euler.pdf (verified: 2013-04-15) .. [Zhang2000] Z. Zhang. A Flexible New Technique for Camera Calibration. IEEE Transactions on Pattern Analysis and Machine Intelligence, 22(11):1330-1334, 2000. diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp index b1fcdc9b0..158ff8e45 100644 --- a/modules/core/src/matmul.cpp +++ b/modules/core/src/matmul.cpp @@ -2850,8 +2850,9 @@ PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, int maxComp if( _mean.data ) { - CV_Assert( _mean.size() == mean_sz ); + CV_Assert( _mean.size() == mean_sz ); _mean.convertTo(mean, ctype); + covar_flags |= CV_COVAR_USE_AVG; } calcCovarMatrix( data, covar, mean, covar_flags, ctype ); diff --git a/modules/flann/include/opencv2/flann/ground_truth.h b/modules/flann/include/opencv2/flann/ground_truth.h index 69d978ba0..fd8f3ae7f 100644 --- a/modules/flann/include/opencv2/flann/ground_truth.h +++ b/modules/flann/include/opencv2/flann/ground_truth.h @@ -42,7 +42,6 @@ template void find_nearest(const Matrix& dataset, typename Distance::ElementType* query, int* matches, int nn, int skip = 0, Distance distance = Distance()) { - typedef typename Distance::ElementType ElementType; typedef typename Distance::ResultType DistanceType; int n = nn + skip; diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt index 5955ab6c1..375d7546f 100644 --- a/modules/highgui/CMakeLists.txt +++ b/modules/highgui/CMakeLists.txt @@ -76,7 +76,26 @@ set(highgui_srcs file(GLOB highgui_ext_hdrs "include/opencv2/*.hpp" "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h") -if(HAVE_QT) +if(HAVE_QT5) + set(CMAKE_AUTOMOC ON) + set(CMAKE_INCLUDE_CURRENT_DIR ON) + + QT5_ADD_RESOURCES(_RCC_OUTFILES src/window_QT.qrc) + list(APPEND highgui_srcs src/window_QT.cpp src/window_QT.h ${_RCC_OUTFILES}) + + foreach(dt5_dep Core Gui Widgets Test Concurrent) + add_definitions(${Qt5${dt5_dep}_DEFINITIONS}) + include_directories(${Qt5${dt5_dep}_INCLUDE_DIRS}) + list(APPEND HIGHGUI_LIBRARIES ${Qt5${dt5_dep}_LIBRARIES}) + endforeach() + + if(HAVE_QT_OPENGL) + add_definitions(${Qt5OpenGL_DEFINITIONS}) + include_directories(${Qt5OpenGL_INCLUDE_DIRS}) + list(APPEND HIGHGUI_LIBRARIES ${Qt5OpenGL_LIBRARIES}) + endif() + +elseif(HAVE_QT) if (HAVE_QT_OPENGL) set(QT_USE_QTOPENGL TRUE) endif() diff --git a/modules/highgui/include/opencv2/highgui/highgui_c.h b/modules/highgui/include/opencv2/highgui/highgui_c.h index 
66f8bd0e9..5c9fde38d 100644 --- a/modules/highgui/include/opencv2/highgui/highgui_c.h +++ b/modules/highgui/include/opencv2/highgui/highgui_c.h @@ -306,6 +306,8 @@ enum CV_CAP_OPENNI_ASUS =910, // OpenNI (for Asus Xtion) CV_CAP_ANDROID =1000, // Android + CV_CAP_ANDROID_BACK =CV_CAP_ANDROID+99, // Android back camera + CV_CAP_ANDROID_FRONT =CV_CAP_ANDROID+98, // Android front camera CV_CAP_XIAPI =1100, // XIMEA Camera API diff --git a/modules/highgui/src/cap_libv4l.cpp b/modules/highgui/src/cap_libv4l.cpp index ec048aff7..b081621b1 100644 --- a/modules/highgui/src/cap_libv4l.cpp +++ b/modules/highgui/src/cap_libv4l.cpp @@ -1665,6 +1665,17 @@ static int icvSetPropertyCAM_V4L(CvCaptureCAM_V4L* capture, int property_id, dou width = height = 0; } break; + case CV_CAP_PROP_FPS: + struct v4l2_streamparm setfps; + memset (&setfps, 0, sizeof(struct v4l2_streamparm)); + setfps.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + setfps.parm.capture.timeperframe.numerator = 1; + setfps.parm.capture.timeperframe.denominator = value; + if (xioctl (capture->deviceHandle, VIDIOC_S_PARM, &setfps) < 0){ + fprintf(stderr, "HIGHGUI ERROR: V4L: Unable to set camera FPS\n"); + retval=0; + } + break; default: retval = icvSetControl(capture, property_id, value); } diff --git a/modules/highgui/src/grfmt_jpeg.cpp b/modules/highgui/src/grfmt_jpeg.cpp index 7054d6dbc..09db677a4 100644 --- a/modules/highgui/src/grfmt_jpeg.cpp +++ b/modules/highgui/src/grfmt_jpeg.cpp @@ -52,6 +52,11 @@ #include #include +// the following defines are a hack to avoid multiple problems with frame ponter handling and setjmp +// see http://gcc.gnu.org/ml/gcc/2011-10/msg00324.html for some details +#define mingw_getsp(...) 0 +#define __builtin_frame_address(...) 0 + #ifdef WIN32 #define XMD_H // prevent redefinition of INT32 diff --git a/modules/highgui/src/grfmt_png.cpp b/modules/highgui/src/grfmt_png.cpp index 77b317f0f..eb68ec892 100644 --- a/modules/highgui/src/grfmt_png.cpp +++ b/modules/highgui/src/grfmt_png.cpp @@ -73,6 +73,11 @@ #pragma warning( disable: 4611 ) #endif +// the following defines are a hack to avoid multiple problems with frame ponter handling and setjmp +// see http://gcc.gnu.org/ml/gcc/2011-10/msg00324.html for some details +#define mingw_getsp(...) 0 +#define __builtin_frame_address(...) 0 + namespace cv { diff --git a/modules/highgui/src/window_QT.h b/modules/highgui/src/window_QT.h index 1b07442dc..089997f51 100644 --- a/modules/highgui/src/window_QT.h +++ b/modules/highgui/src/window_QT.h @@ -48,13 +48,13 @@ #endif #include -#include +#include #include #include -#include +#include #include #include -#include +#include #include #include #include @@ -78,7 +78,7 @@ #include #include #include -#include +#include //start private enum enum { CV_MODE_NORMAL = 0, CV_MODE_OPENGL = 1 }; diff --git a/modules/imgproc/doc/structural_analysis_and_shape_descriptors.rst b/modules/imgproc/doc/structural_analysis_and_shape_descriptors.rst index 04e29ad9a..6f7cba3a9 100644 --- a/modules/imgproc/doc/structural_analysis_and_shape_descriptors.rst +++ b/modules/imgproc/doc/structural_analysis_and_shape_descriptors.rst @@ -342,7 +342,7 @@ Finds the convex hull of a point set. :param hull_storage: Output memory storage in the old API (``cvConvexHull2`` returns a sequence containing the convex hull points or their indices). - :param clockwise: Orientation flag. If it is true, the output convex hull is oriented clockwise. Otherwise, it is oriented counter-clockwise. 
The usual screen coordinate system is assumed so that the origin is at the top-left corner, x axis is oriented to the right, and y axis is oriented downwards. + :param clockwise: Orientation flag. If it is true, the output convex hull is oriented clockwise. Otherwise, it is oriented counter-clockwise. The assumed coordinate system has its X axis pointing to the right, and its Y axis pointing upwards. :param orientation: Convex hull orientation parameter in the old API, ``CV_CLOCKWISE`` or ``CV_COUNTERCLOCKWISE``. diff --git a/modules/imgproc/src/floodfill.cpp b/modules/imgproc/src/floodfill.cpp index 5436c7b02..40556c48d 100644 --- a/modules/imgproc/src/floodfill.cpp +++ b/modules/imgproc/src/floodfill.cpp @@ -127,7 +127,6 @@ floodFill_CnIR( Mat& image, Point seed, _Tp newVal, ConnectedComp* region, int flags, std::vector* buffer ) { - typedef typename DataType<_Tp>::channel_type _CTp; _Tp* img = (_Tp*)(image.data + image.step * seed.y); Size roi = image.size(); int i, L, R; @@ -279,7 +278,6 @@ floodFillGrad_CnIR( Mat& image, Mat& msk, Diff diff, ConnectedComp* region, int flags, std::vector* buffer ) { - typedef typename DataType<_Tp>::channel_type _CTp; int step = (int)image.step, maskStep = (int)msk.step; uchar* pImage = image.data; _Tp* img = (_Tp*)(pImage + step*seed.y); @@ -610,7 +608,7 @@ int cv::floodFill( InputOutputArray _image, InputOutputArray _mask, &comp, flags, &buffer); else CV_Error(CV_StsUnsupportedFormat, ""); - + if( rect ) *rect = comp.rect; return comp.area; diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index ba9cd6d7b..05a96300b 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -1219,8 +1219,6 @@ static void resizeGeneric_( const Mat& src, Mat& dst, const int* yofs, const void* _beta, int xmin, int xmax, int ksize ) { - typedef typename HResize::value_type T; - typedef typename HResize::buf_type WT; typedef typename HResize::alpha_type AT; const AT* beta = (const AT*)_beta; diff --git a/modules/java/android_lib/lint.xml b/modules/java/android_lib/lint.xml index e54ced1dc..a95e509d2 100644 --- a/modules/java/android_lib/lint.xml +++ b/modules/java/android_lib/lint.xml @@ -1,5 +1,8 @@ + + + diff --git a/modules/java/android_lib/res/values/attrs.xml b/modules/java/android_lib/res/values/attrs.xml index 0cdf1097a..6902621f6 100644 --- a/modules/java/android_lib/res/values/attrs.xml +++ b/modules/java/android_lib/res/values/attrs.xml @@ -4,8 +4,8 @@ - - + + diff --git a/modules/java/generator/src/java/android+CameraBridgeViewBase.java b/modules/java/generator/src/java/android+CameraBridgeViewBase.java index 36417c582..6c5c3294f 100644 --- a/modules/java/generator/src/java/android+CameraBridgeViewBase.java +++ b/modules/java/generator/src/java/android+CameraBridgeViewBase.java @@ -47,10 +47,14 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac protected int mMaxWidth; protected float mScale = 0; protected int mPreviewFormat = Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA; - protected int mCameraIndex = -1; + protected int mCameraIndex = CAMERA_ID_ANY; protected boolean mEnabled; protected FpsMeter mFpsMeter = null; + public static final int CAMERA_ID_ANY = -1; + public static final int CAMERA_ID_BACK = 99; + public static final int CAMERA_ID_FRONT = 98; + public CameraBridgeViewBase(Context context, int cameraId) { super(context); mCameraIndex = cameraId; @@ -74,6 +78,7 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac 
getHolder().addCallback(this); mMaxWidth = MAX_UNSPECIFIED; mMaxHeight = MAX_UNSPECIFIED; + styledAttrs.recycle(); } public interface CvCameraViewListener { @@ -155,8 +160,6 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac mPreviewFormat = format; } - private CvCameraViewListenerAdapter() {} - private int mPreviewFormat = Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA; private CvCameraViewListener mOldStyleListener; }; diff --git a/modules/java/generator/src/java/android+JavaCameraView.java b/modules/java/generator/src/java/android+JavaCameraView.java index f07b7d2ca..0dcdad2fb 100644 --- a/modules/java/generator/src/java/android+JavaCameraView.java +++ b/modules/java/generator/src/java/android+JavaCameraView.java @@ -6,6 +6,7 @@ import android.content.Context; import android.graphics.ImageFormat; import android.graphics.SurfaceTexture; import android.hardware.Camera; +import android.hardware.Camera.CameraInfo; import android.hardware.Camera.PreviewCallback; import android.os.Build; import android.util.AttributeSet; @@ -68,7 +69,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb synchronized (this) { mCamera = null; - if (mCameraIndex == -1) { + if (mCameraIndex == CAMERA_ID_ANY) { Log.d(TAG, "Trying to open camera with old open()"); try { mCamera = Camera.open(); @@ -92,11 +93,39 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb } } else { if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.GINGERBREAD) { - Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(mCameraIndex) + ")"); - try { - mCamera = Camera.open(mCameraIndex); - } catch (RuntimeException e) { - Log.e(TAG, "Camera #" + mCameraIndex + "failed to open: " + e.getLocalizedMessage()); + int localCameraIndex = mCameraIndex; + if (mCameraIndex == CAMERA_ID_BACK) { + Log.i(TAG, "Trying to open back camera"); + Camera.CameraInfo cameraInfo = new Camera.CameraInfo(); + for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) { + Camera.getCameraInfo( camIdx, cameraInfo ); + if (cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_BACK) { + localCameraIndex = camIdx; + break; + } + } + } else if (mCameraIndex == CAMERA_ID_FRONT) { + Log.i(TAG, "Trying to open front camera"); + Camera.CameraInfo cameraInfo = new Camera.CameraInfo(); + for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) { + Camera.getCameraInfo( camIdx, cameraInfo ); + if (cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_FRONT) { + localCameraIndex = camIdx; + break; + } + } + } + if (localCameraIndex == CAMERA_ID_BACK) { + Log.e(TAG, "Back camera not found!"); + } else if (localCameraIndex == CAMERA_ID_FRONT) { + Log.e(TAG, "Front camera not found!"); + } else { + Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(localCameraIndex) + ")"); + try { + mCamera = Camera.open(localCameraIndex); + } catch (RuntimeException e) { + Log.e(TAG, "Camera #" + localCameraIndex + "failed to open: " + e.getLocalizedMessage()); + } } } } @@ -179,6 +208,8 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb synchronized (this) { if (mCamera != null) { mCamera.stopPreview(); + mCamera.setPreviewCallback(null); + mCamera.release(); } mCamera = null; @@ -267,9 +298,6 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb mRgba.release(); } - private JavaCameraFrame(CvCameraViewFrame obj) { - } - private Mat mYuvFrameData; private Mat mRgba; private int mWidth; diff --git 
a/modules/java/generator/src/java/android+NativeCameraView.java b/modules/java/generator/src/java/android+NativeCameraView.java index 496ed53d6..62d077580 100644 --- a/modules/java/generator/src/java/android+NativeCameraView.java +++ b/modules/java/generator/src/java/android+NativeCameraView.java @@ -53,14 +53,16 @@ public class NativeCameraView extends CameraBridgeViewBase { /* 1. We need to stop thread which updating the frames * 2. Stop camera and release it */ - try { - mStopThread = true; - mThread.join(); - } catch (InterruptedException e) { - e.printStackTrace(); - } finally { - mThread = null; - mStopThread = false; + if (mThread != null) { + try { + mStopThread = true; + mThread.join(); + } catch (InterruptedException e) { + e.printStackTrace(); + } finally { + mThread = null; + mStopThread = false; + } } /* Now release camera */ @@ -131,17 +133,17 @@ public class NativeCameraView extends CameraBridgeViewBase { } } - private class NativeCameraFrame implements CvCameraViewFrame { + private static class NativeCameraFrame implements CvCameraViewFrame { @Override public Mat rgba() { - mCamera.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA); + mCapture.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA); return mRgba; } @Override public Mat gray() { - mCamera.retrieve(mGray, Highgui.CV_CAP_ANDROID_GREY_FRAME); + mCapture.retrieve(mGray, Highgui.CV_CAP_ANDROID_GREY_FRAME); return mGray; } @@ -158,9 +160,6 @@ public class NativeCameraView extends CameraBridgeViewBase { private class CameraWorker implements Runnable { - private Mat mRgba = new Mat(); - private Mat mGray = new Mat(); - public void run() { do { if (!mCamera.grab()) { diff --git a/modules/legacy/src/blobtrackgenyml.cpp b/modules/legacy/src/blobtrackgenyml.cpp index 5d0e870fe..8a414915c 100644 --- a/modules/legacy/src/blobtrackgenyml.cpp +++ b/modules/legacy/src/blobtrackgenyml.cpp @@ -61,7 +61,7 @@ protected: { int ObjNum = m_TrackList.GetBlobNum(); int i; - char video_name[1024]; + char video_name[1024+1]; char* struct_name = NULL; CvFileStorage* storage = cvOpenFileStorage(m_pFileName,NULL,CV_STORAGE_WRITE_TEXT); diff --git a/modules/legacy/src/kdtree.cpp b/modules/legacy/src/kdtree.cpp index 12a4acd69..a32677b79 100644 --- a/modules/legacy/src/kdtree.cpp +++ b/modules/legacy/src/kdtree.cpp @@ -117,10 +117,10 @@ class CvKDTreeWrap : public CvFeatureTree { CvMat* results) { int rn = results->rows * results->cols; std::vector inbounds; - dispatch_cvtype(mat, ((__treetype*)data)-> - find_ortho_range((typename __treetype::scalar_type*)bounds_min->data.ptr, + assert(CV_MAT_DEPTH(mat->type) == CV_32F || CV_MAT_DEPTH(mat->type) == CV_64F); + ((__treetype*)data)->find_ortho_range((typename __treetype::scalar_type*)bounds_min->data.ptr, (typename __treetype::scalar_type*)bounds_max->data.ptr, - inbounds)); + inbounds); std::copy(inbounds.begin(), inbounds.begin() + std::min((int)inbounds.size(), rn), (int*) results->data.ptr); diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp index 183ab295c..9373f1c1d 100644 --- a/modules/objdetect/src/cascadedetect.cpp +++ b/modules/objdetect/src/cascadedetect.cpp @@ -1140,7 +1140,7 @@ void CascadeClassifier::detectMultiScale( const Mat& image, std::vector& o Size windowSize( cvRound(originalWindowSize.width*factor), cvRound(originalWindowSize.height*factor) ); Size scaledImageSize( cvRound( grayImage.cols/factor ), cvRound( grayImage.rows/factor ) ); - Size processingRectSize( scaledImageSize.width - originalWindowSize.width + 1, 
scaledImageSize.height - originalWindowSize.height + 1 ); + Size processingRectSize( scaledImageSize.width - originalWindowSize.width, scaledImageSize.height - originalWindowSize.height ); if( processingRectSize.width <= 0 || processingRectSize.height <= 0 ) break; diff --git a/modules/ocl/include/opencv2/ocl.hpp b/modules/ocl/include/opencv2/ocl.hpp index 745953d04..0df96dbf1 100644 --- a/modules/ocl/include/opencv2/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl.hpp @@ -151,7 +151,7 @@ namespace cv static Context *getContext(); static void setContext(Info &oclinfo); - enum {CL_DOUBLE, CL_UNIFIED_MEM}; + enum {CL_DOUBLE, CL_UNIFIED_MEM, CL_VER_1_2}; bool supportsFeature(int ftype); size_t computeUnits(); size_t maxWorkGroupSize(); @@ -264,9 +264,9 @@ namespace cv void create(Size size, int type); //! allocates new oclMatrix with specified device memory type. - void createEx(int rows, int cols, int type, + void createEx(int rows, int cols, int type, DevMemRW rw_type, DevMemType mem_type, void* hptr = 0); - void createEx(Size size, int type, DevMemRW rw_type, + void createEx(Size size, int type, DevMemRW rw_type, DevMemType mem_type, void* hptr = 0); //! decreases reference counter; @@ -406,6 +406,9 @@ namespace cv //! computes element-wise product of the two arrays (c = a * b) // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 CV_EXPORTS void multiply(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1); + //! multiplies matrix to a number (dst = scalar * src) + // supports CV_32FC1 only + CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst); //! computes element-wise quotient of the two arrays (c = a / b) // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4 CV_EXPORTS void divide(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1); @@ -823,7 +826,44 @@ namespace cv }; #endif + class CV_EXPORTS OclCascadeClassifierBuf : public cv::CascadeClassifier + { + public: + OclCascadeClassifierBuf() : + m_flags(0), initialized(false), m_scaleFactor(0), buffers(NULL) {} + ~OclCascadeClassifierBuf() {} + + void detectMultiScale(oclMat &image, CV_OUT std::vector& faces, + double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0, + Size minSize = Size(), Size maxSize = Size()); + void release(); + + private: + void Init(const int rows, const int cols, double scaleFactor, int flags, + const int outputsz, const size_t localThreads[], + Size minSize, Size maxSize); + void CreateBaseBufs(const int datasize, const int totalclassifier, const int flags, const int outputsz); + void CreateFactorRelatedBufs(const int rows, const int cols, const int flags, + const double scaleFactor, const size_t localThreads[], + Size minSize, Size maxSize); + void GenResult(CV_OUT std::vector& faces, const std::vector &rectList, const std::vector &rweights); + + int m_rows; + int m_cols; + int m_flags; + int m_loopcount; + int m_nodenum; + bool findBiggestObject; + bool initialized; + double m_scaleFactor; + Size m_minSize; + Size m_maxSize; + std::vector sizev; + std::vector scalev; + oclMat gimg1, gsum, gsqsum; + void * buffers; + }; /////////////////////////////// Pyramid ///////////////////////////////////// CV_EXPORTS void pyrDown(const oclMat &src, oclMat &dst); @@ -849,7 +889,6 @@ namespace cv std::vector image_sqsums; }; - //! 
computes the proximity map for the raster template and the image where the template is searched for // Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4 // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4 @@ -1093,13 +1132,11 @@ namespace cv /****************************************************************************************\ * Distance * \****************************************************************************************/ - template struct CV_EXPORTS Accumulator { typedef T Type; }; - template<> struct Accumulator { typedef float Type; @@ -1173,469 +1210,244 @@ namespace cv { public: enum DistType {L1Dist = 0, L2Dist, HammingDist}; - explicit BruteForceMatcher_OCL_base(DistType distType = L2Dist); - - // Add descriptors to train descriptor collection - void add(const std::vector &descCollection); - - // Get train descriptors collection - const std::vector &getTrainDescriptors() const; - - // Clear train descriptors collection - void clear(); - - // Return true if there are not train descriptors in collection - bool empty() const; - - // Return true if the matcher supports mask in match methods - bool isMaskSupported() const; - - // Find one best match for each query descriptor - void matchSingle(const oclMat &query, const oclMat &train, - oclMat &trainIdx, oclMat &distance, - const oclMat &mask = oclMat()); - - // Download trainIdx and distance and convert it to CPU vector with DMatch - static void matchDownload(const oclMat &trainIdx, const oclMat &distance, std::vector &matches); - // Convert trainIdx and distance to vector with DMatch - static void matchConvert(const Mat &trainIdx, const Mat &distance, std::vector &matches); - - // Find one best match for each query descriptor - void match(const oclMat &query, const oclMat &train, std::vector &matches, const oclMat &mask = oclMat()); - - // Make gpu collection of trains and masks in suitable format for matchCollection function - void makeGpuCollection(oclMat &trainCollection, oclMat &maskCollection, const std::vector &masks = std::vector()); - - // Find one best match from train collection for each query descriptor - void matchCollection(const oclMat &query, const oclMat &trainCollection, - oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, - const oclMat &masks = oclMat()); - - // Download trainIdx, imgIdx and distance and convert it to vector with DMatch - static void matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, std::vector &matches); - // Convert trainIdx, imgIdx and distance to vector with DMatch - static void matchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, std::vector &matches); - - // Find one best match from train collection for each query descriptor. - void match(const oclMat &query, std::vector &matches, const std::vector &masks = std::vector()); - - // Find k best matches for each query descriptor (in increasing order of distances) - void knnMatchSingle(const oclMat &query, const oclMat &train, - oclMat &trainIdx, oclMat &distance, oclMat &allDist, int k, - const oclMat &mask = oclMat()); - - // Download trainIdx and distance and convert it to vector with DMatch - // compactResult is used when mask is not empty. If compactResult is false matches - // vector will have the same size as queryDescriptors rows. If compactResult is true - // matches vector will not contain matches for fully masked out query descriptors. 
- static void knnMatchDownload(const oclMat &trainIdx, const oclMat &distance, - std::vector< std::vector > &matches, bool compactResult = false); - // Convert trainIdx and distance to vector with DMatch - static void knnMatchConvert(const Mat &trainIdx, const Mat &distance, - std::vector< std::vector > &matches, bool compactResult = false); - - // Find k best matches for each query descriptor (in increasing order of distances). - // compactResult is used when mask is not empty. If compactResult is false matches - // vector will have the same size as queryDescriptors rows. If compactResult is true - // matches vector will not contain matches for fully masked out query descriptors. - void knnMatch(const oclMat &query, const oclMat &train, - std::vector< std::vector > &matches, int k, const oclMat &mask = oclMat(), - bool compactResult = false); - - // Find k best matches from train collection for each query descriptor (in increasing order of distances) - void knnMatch2Collection(const oclMat &query, const oclMat &trainCollection, - oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, - const oclMat &maskCollection = oclMat()); - - // Download trainIdx and distance and convert it to vector with DMatch - // compactResult is used when mask is not empty. If compactResult is false matches - // vector will have the same size as queryDescriptors rows. If compactResult is true - // matches vector will not contain matches for fully masked out query descriptors. - static void knnMatch2Download(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, - std::vector< std::vector > &matches, bool compactResult = false); - // Convert trainIdx and distance to vector with DMatch - static void knnMatch2Convert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, - std::vector< std::vector > &matches, bool compactResult = false); - - // Find k best matches for each query descriptor (in increasing order of distances). - // compactResult is used when mask is not empty. If compactResult is false matches - // vector will have the same size as queryDescriptors rows. If compactResult is true - // matches vector will not contain matches for fully masked out query descriptors. - void knnMatch(const oclMat &query, std::vector< std::vector > &matches, int k, - const std::vector &masks = std::vector(), bool compactResult = false); - - // Find best matches for each query descriptor which have distance less than maxDistance. - // nMatches.at(0, queryIdx) will contain matches count for queryIdx. - // carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches, - // because it didn't have enough memory. - // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10), - // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches - // Matches doesn't sorted. - void radiusMatchSingle(const oclMat &query, const oclMat &train, - oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance, - const oclMat &mask = oclMat()); - - // Download trainIdx, nMatches and distance and convert it to vector with DMatch. - // matches will be sorted in increasing order of distances. - // compactResult is used when mask is not empty. If compactResult is false matches - // vector will have the same size as queryDescriptors rows. If compactResult is true - // matches vector will not contain matches for fully masked out query descriptors. 
- static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, - std::vector< std::vector > &matches, bool compactResult = false); - // Convert trainIdx, nMatches and distance to vector with DMatch. - static void radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &nMatches, - std::vector< std::vector > &matches, bool compactResult = false); - - // Find best matches for each query descriptor which have distance less than maxDistance - // in increasing order of distances). - void radiusMatch(const oclMat &query, const oclMat &train, - std::vector< std::vector > &matches, float maxDistance, - const oclMat &mask = oclMat(), bool compactResult = false); - - // Find best matches for each query descriptor which have distance less than maxDistance. - // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10), - // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches - // Matches doesn't sorted. - void radiusMatchCollection(const oclMat &query, oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, oclMat &nMatches, float maxDistance, - const std::vector &masks = std::vector()); - - // Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch. - // matches will be sorted in increasing order of distances. - // compactResult is used when mask is not empty. If compactResult is false matches - // vector will have the same size as queryDescriptors rows. If compactResult is true - // matches vector will not contain matches for fully masked out query descriptors. - static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, const oclMat &nMatches, - std::vector< std::vector > &matches, bool compactResult = false); - // Convert trainIdx, nMatches and distance to vector with DMatch. - static void radiusMatchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, const Mat &nMatches, - std::vector< std::vector > &matches, bool compactResult = false); - - // Find best matches from train collection for each query descriptor which have distance less than - // maxDistance (in increasing order of distances). 
- void radiusMatch(const oclMat &query, std::vector< std::vector > &matches, float maxDistance, - const std::vector &masks = std::vector(), bool compactResult = false); - - DistType distType; - - private: - std::vector trainDescCollection; - }; - - template - class CV_EXPORTS BruteForceMatcher_OCL; - - template - class CV_EXPORTS BruteForceMatcher_OCL< L1 > : public BruteForceMatcher_OCL_base - { - public: - explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L1Dist) {} - explicit BruteForceMatcher_OCL(L1 /*d*/) : BruteForceMatcher_OCL_base(L1Dist) {} - }; - template - class CV_EXPORTS BruteForceMatcher_OCL< L2 > : public BruteForceMatcher_OCL_base - { - public: - explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L2Dist) {} - explicit BruteForceMatcher_OCL(L2 /*d*/) : BruteForceMatcher_OCL_base(L2Dist) {} - }; - template <> class CV_EXPORTS BruteForceMatcher_OCL< Hamming > : public BruteForceMatcher_OCL_base - { - public: - explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(HammingDist) {} - explicit BruteForceMatcher_OCL(Hamming /*d*/) : BruteForceMatcher_OCL_base(HammingDist) {} - }; - + class CV_EXPORTS BFMatcher_OCL : public BruteForceMatcher_OCL_base + { + public: + explicit BFMatcher_OCL(int norm = NORM_L2) : BruteForceMatcher_OCL_base(norm == NORM_L1 ? L1Dist : norm == NORM_L2 ? L2Dist : HammingDist) {} + }; /////////////////////////////// PyrLKOpticalFlow ///////////////////////////////////// class CV_EXPORTS PyrLKOpticalFlow - { - public: - PyrLKOpticalFlow() - { - winSize = Size(21, 21); - maxLevel = 3; - iters = 30; - derivLambda = 0.5; - useInitialFlow = false; - minEigThreshold = 1e-4f; - getMinEigenVals = false; - isDeviceArch11_ = false; - } - - void sparse(const oclMat &prevImg, const oclMat &nextImg, const oclMat &prevPts, oclMat &nextPts, - oclMat &status, oclMat *err = 0); - - void dense(const oclMat &prevImg, const oclMat &nextImg, oclMat &u, oclMat &v, oclMat *err = 0); - - Size winSize; - int maxLevel; - int iters; - double derivLambda; - bool useInitialFlow; - float minEigThreshold; - bool getMinEigenVals; - - void releaseMemory() - { - dx_calcBuf_.release(); - dy_calcBuf_.release(); - - prevPyr_.clear(); - nextPyr_.clear(); - - dx_buf_.release(); - dy_buf_.release(); - } - - private: - void calcSharrDeriv(const oclMat &src, oclMat &dx, oclMat &dy); - - void buildImagePyramid(const oclMat &img0, std::vector &pyr, bool withBorder); - - oclMat dx_calcBuf_; - oclMat dy_calcBuf_; - - std::vector prevPyr_; - std::vector nextPyr_; - - oclMat dx_buf_; - oclMat dy_buf_; - - oclMat uPyr_[2]; - oclMat vPyr_[2]; - - bool isDeviceArch11_; - }; //////////////// build warping maps //////////////////// //! 
builds plane warping maps @@ -1706,6 +1518,7 @@ namespace cv private: oclMat minSSD, leBuf, riBuf; }; + class CV_EXPORTS StereoBeliefPropagation { public: @@ -1736,6 +1549,133 @@ namespace cv std::vector datas; oclMat out; }; + + class CV_EXPORTS StereoConstantSpaceBP + { + public: + enum { DEFAULT_NDISP = 128 }; + enum { DEFAULT_ITERS = 8 }; + enum { DEFAULT_LEVELS = 4 }; + enum { DEFAULT_NR_PLANE = 4 }; + static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane); + explicit StereoConstantSpaceBP( + int ndisp = DEFAULT_NDISP, + int iters = DEFAULT_ITERS, + int levels = DEFAULT_LEVELS, + int nr_plane = DEFAULT_NR_PLANE, + int msg_type = CV_32F); + StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, + float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, + int min_disp_th = 0, + int msg_type = CV_32F); + void operator()(const oclMat &left, const oclMat &right, oclMat &disparity); + int ndisp; + int iters; + int levels; + int nr_plane; + float max_data_term; + float data_weight; + float max_disc_term; + float disc_single_jump; + int min_disp_th; + int msg_type; + bool use_local_init_data_cost; + private: + oclMat u[2], d[2], l[2], r[2]; + oclMat disp_selected_pyr[2]; + oclMat data_cost; + oclMat data_cost_selected; + oclMat temp; + oclMat out; + }; + + // Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method + // + // see reference: + // [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow". + // [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation". + class CV_EXPORTS OpticalFlowDual_TVL1_OCL + { + public: + OpticalFlowDual_TVL1_OCL(); + + void operator ()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy); + + void collectGarbage(); + + /** + * Time step of the numerical scheme. + */ + double tau; + + /** + * Weight parameter for the data term, attachment parameter. + * This is the most relevant parameter, which determines the smoothness of the output. + * The smaller this parameter is, the smoother the solutions we obtain. + * It depends on the range of motions of the images, so its value should be adapted to each image sequence. + */ + double lambda; + + /** + * Weight parameter for (u - v)^2, tightness parameter. + * It serves as a link between the attachment and the regularization terms. + * In theory, it should have a small value in order to maintain both parts in correspondence. + * The method is stable for a large range of values of this parameter. + */ + double theta; + + /** + * Number of scales used to create the pyramid of images. + */ + int nscales; + + /** + * Number of warpings per scale. + * Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale. + * This is a parameter that assures the stability of the method. + * It also affects the running time, so it is a compromise between speed and accuracy. + */ + int warps; + + /** + * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time. + * A small value will yield more accurate solutions at the expense of a slower convergence. + */ + double epsilon; + + /** + * Stopping criterion iterations number used in the numerical scheme. 
+ */ + int iterations; + + bool useInitialFlow; + + private: + void procOneScale(const oclMat& I0, const oclMat& I1, oclMat& u1, oclMat& u2); + + std::vector I0s; + std::vector I1s; + std::vector u1s; + std::vector u2s; + + oclMat I1x_buf; + oclMat I1y_buf; + + oclMat I1w_buf; + oclMat I1wx_buf; + oclMat I1wy_buf; + + oclMat grad_buf; + oclMat rho_c_buf; + + oclMat p11_buf; + oclMat p12_buf; + oclMat p21_buf; + oclMat p22_buf; + + oclMat diff_buf; + oclMat norm_buf; + }; } } #if defined _MSC_VER && _MSC_VER >= 1200 diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index eb8ff427d..3dd46545a 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -45,4 +45,4 @@ #error this is a compatibility header which should not be used inside the OpenCV library #endif -#include "opencv2/ocl.hpp" \ No newline at end of file +#include "opencv2/ocl.hpp" diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index f7024fffa..d425344d8 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -22,6 +22,7 @@ // Jiang Liyuan, jlyuan001.good@163.com // Rock Li, Rock.Li@amd.com // Zailong Wu, bullet@yeah.net +// Peng Xiao, pengxiao@outlook.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -61,8 +62,6 @@ namespace cv namespace ocl { ////////////////////////////////OpenCL kernel strings///////////////////// - extern const char *bitwise; - extern const char *bitwiseM; extern const char *transpose_kernel; extern const char *arithm_nonzero; extern const char *arithm_sum; @@ -76,19 +75,11 @@ namespace cv extern const char *arithm_add; extern const char *arithm_add_scalar; extern const char *arithm_add_scalar_mask; + extern const char *arithm_bitwise_binary; + extern const char *arithm_bitwise_binary_mask; + extern const char *arithm_bitwise_binary_scalar; + extern const char *arithm_bitwise_binary_scalar_mask; extern const char *arithm_bitwise_not; - extern const char *arithm_bitwise_and; - extern const char *arithm_bitwise_and_mask; - extern const char *arithm_bitwise_and_scalar; - extern const char *arithm_bitwise_and_scalar_mask; - extern const char *arithm_bitwise_or; - extern const char *arithm_bitwise_or_mask; - extern const char *arithm_bitwise_or_scalar; - extern const char *arithm_bitwise_or_scalar_mask; - extern const char *arithm_bitwise_xor; - extern const char *arithm_bitwise_xor_mask; - extern const char *arithm_bitwise_xor_scalar; - extern const char *arithm_bitwise_xor_scalar_mask; extern const char *arithm_compare_eq; extern const char *arithm_compare_ne; extern const char *arithm_mul; @@ -126,7 +117,7 @@ inline int divUp(int total, int grain) /////////////////////// add subtract multiply divide ///////////////////////// ////////////////////////////////////////////////////////////////////////////// template -void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, +void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, const char **kernelString, void *_scalar, int op_type = 0) { if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) @@ -195,12 +186,12 @@ void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); } } -static void arithmetic_run(const oclMat &src1, const oclMat &src2, 
oclMat &dst, +static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, const char **kernelString, int op_type = 0) { arithmetic_run(src1, src2, dst, kernelName, kernelString, (void *)NULL, op_type); } -static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask, +static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString, int op_type = 0) { if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) @@ -295,6 +286,7 @@ void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, doub else arithmetic_run(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar)); } + void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar) { @@ -479,6 +471,11 @@ void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, cons arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, -1); } +void cv::ocl::multiply(double scalar, const oclMat &src, oclMat &dst) +{ + String kernelName = "arithm_muls"; + arithmetic_scalar_run( src, dst, kernelName, &arithm_mul, scalar); +} void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst) { if(!src.clCxt->supportsFeature(Context::CL_DOUBLE)) @@ -1647,7 +1644,8 @@ static void bitwise_run(const oclMat &src1, oclMat &dst, String kernelName, cons template -void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, const char **kernelString, void *_scalar) +void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, + const char **kernelString, void *_scalar, const char* _opt = NULL) { dst.create(src1.size(), src1.type()); CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols && @@ -1697,13 +1695,15 @@ void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String ker args.push_back( std::make_pair( sizeof(T), (void *)&scalar )); } - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); + openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, _opt); } -static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, const char **kernelString) +static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, + String kernelName, const char **kernelString, const char* _opt = NULL) { - bitwise_run(src1, src2, dst, kernelName, kernelString, (void *)NULL); + bitwise_run(src1, src2, dst, kernelName, kernelString, (void *)NULL, _opt); } -static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString) +static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, + const oclMat &mask, String kernelName, const char **kernelString, const char* _opt = NULL) { dst.create(src1.size(), src1.type()); CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols && @@ -1751,12 +1751,13 @@ static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, con args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols )); args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 )); - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth); + openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, _opt); } 
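The change in this hunk threads a new `_opt` build-options string through the bitwise_run helpers so that the former per-operator kernel sources (arithm_bitwise_and/or/xor and their mask/scalar variants) can be replaced by a single arithm_bitwise_binary kernel family; the concrete operator is injected at kernel compile time through "-D OP_BINARY=&", "-D OP_BINARY=|" or "-D OP_BINARY=^", as the bitwise_and/or/xor wrappers further below show. A minimal sketch of the pattern, with an illustrative kernel only (this is not the actual arithm_bitwise_binary source, and it ignores steps, offsets and multi-channel handling):

    // OpenCL C: OP_BINARY is supplied by the host through the program build options
    // (the options argument of clBuildProgram, e.g. "-D OP_BINARY=&"),
    // so one kernel source covers &, | and ^.
    __kernel void bitwise_binary_sketch(__global const uchar *src1,
                                        __global const uchar *src2,
                                        __global uchar *dst,
                                        int total)
    {
        int gid = (int)get_global_id(0);
        if (gid < total)
            dst[gid] = src1[gid] OP_BINARY src2[gid];
    }

On the host side the wrappers only have to pick the option string, exactly as done below with `static const char opt [] = "-D OP_BINARY=...";` and then pass it to bitwise_run, which forwards it as the last argument of openCLExecuteKernel.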
template -void bitwise_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar) +void bitwise_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, + const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar, const char* opt = NULL) { dst.create(src1.size(), src1.type()); @@ -1818,14 +1819,16 @@ void bitwise_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, con args.push_back( std::make_pair( sizeof(cl_int) , (void *)&isMatSubScalar)); } - openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth); + openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, opt); } -typedef void (*BitwiseFuncS)(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar); +typedef void (*BitwiseFuncS)(const oclMat &src1, const Scalar &src2, oclMat &dst, + const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar, const char* opt); -static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar) +static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, + const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar, const char* opt) { static BitwiseFuncS tab[8] = { @@ -1853,11 +1856,12 @@ static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, BitwiseFuncS func = tab[src1.depth()]; if(func == 0) cv::error(Error::StsBadArg, "Unsupported arithmetic operation", "", __FILE__, __LINE__); - func(src1, src2, dst, mask, kernelName, kernelString, isMatSubScalar); + func(src1, src2, dst, mask, kernelName, kernelString, isMatSubScalar, opt); } -static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString) +static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, + const oclMat &mask, String kernelName, const char **kernelString, const char * opt = NULL) { - bitwise_scalar(src1, src2, dst, mask, kernelName, kernelString, 0); + bitwise_scalar(src1, src2, dst, mask, kernelName, kernelString, 0, opt); } void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst) @@ -1880,12 +1884,13 @@ void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, co std::cout << "Selected device do not support double" << std::endl; return; } - oclMat emptyMat; - String kernelName = mask.empty() ? "arithm_bitwise_or" : "arithm_bitwise_or_with_mask"; + + String kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask"; + static const char opt [] = "-D OP_BINARY=|"; if (mask.empty()) - bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_or); + bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt); else - bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_or_mask); + bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt); } @@ -1896,11 +1901,12 @@ void cv::ocl::bitwise_or(const oclMat &src1, const Scalar &src2, oclMat &dst, co std::cout << "Selected device do not support double" << std::endl; return; } - String kernelName = mask.data ? 
"arithm_s_bitwise_or_with_mask" : "arithm_s_bitwise_or"; + static const char opt [] = "-D OP_BINARY=|"; + String kernelName = mask.data ? "arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary"; if (mask.data) - bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_or_scalar_mask); + bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt); else - bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_or_scalar); + bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt); } void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) @@ -1913,12 +1919,13 @@ void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, c } oclMat emptyMat; - String kernelName = mask.empty() ? "arithm_bitwise_and" : "arithm_bitwise_and_with_mask"; + String kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask"; + static const char opt [] = "-D OP_BINARY=&"; if (mask.empty()) - bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_and); + bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt); else - bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_and_mask); + bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt); } void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) @@ -1928,11 +1935,12 @@ void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, c std::cout << "Selected device do not support double" << std::endl; return; } - String kernelName = mask.data ? "arithm_s_bitwise_and_with_mask" : "arithm_s_bitwise_and"; + static const char opt [] = "-D OP_BINARY=&"; + String kernelName = mask.data ? "arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary"; if (mask.data) - bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_and_scalar_mask); + bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt); else - bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_and_scalar); + bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt); } void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) @@ -1942,14 +1950,14 @@ void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, c std::cout << "Selected device do not support double" << std::endl; return; } - oclMat emptyMat; - String kernelName = mask.empty() ? "arithm_bitwise_xor" : "arithm_bitwise_xor_with_mask"; + String kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask"; + static const char opt [] = "-D OP_BINARY=^"; if (mask.empty()) - bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_xor); + bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt); else - bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_xor_mask); + bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt); } @@ -1961,11 +1969,12 @@ void cv::ocl::bitwise_xor(const oclMat &src1, const Scalar &src2, oclMat &dst, c std::cout << "Selected device do not support double" << std::endl; return; } - String kernelName = mask.data ? "arithm_s_bitwise_xor_with_mask" : "arithm_s_bitwise_xor"; + String kernelName = mask.data ? 
"arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary"; + static const char opt [] = "-D OP_BINARY=^"; if (mask.data) - bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_xor_scalar_mask); + bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt); else - bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_xor_scalar); + bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt); } oclMat cv::ocl::operator ~ (const oclMat &src) diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp index 1834e7bc1..e8e0e588c 100644 --- a/modules/ocl/src/brute_force_matcher.cpp +++ b/modules/ocl/src/brute_force_matcher.cpp @@ -844,8 +844,8 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch2Collection(const oclMat &quer if (query.empty() || trainCollection.empty()) return; - typedef void (*caller_t)(const oclMat & query, const oclMat & trains, const oclMat & masks, - const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance); + // typedef void (*caller_t)(const oclMat & query, const oclMat & trains, const oclMat & masks, + // const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance); CV_Assert(query.channels() == 1 && query.depth() < CV_64F); @@ -992,7 +992,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, std::vec // radiusMatchSingle void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, const oclMat &train, - oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask) + oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask) { if (query.empty() || train.empty()) return; @@ -1094,9 +1094,9 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchCollection(const oclMat &qu if (query.empty() || empty()) return; +#if 0 typedef void (*caller_t)(const oclMat & query, const oclMat * trains, int n, float maxDistance, const oclMat * masks, const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance, const oclMat & nMatches); -#if 0 static const caller_t callers[3][6] = { { diff --git a/modules/ocl/src/gemm.cpp b/modules/ocl/src/gemm.cpp index 11bb375f0..0867023a7 100644 --- a/modules/ocl/src/gemm.cpp +++ b/modules/ocl/src/gemm.cpp @@ -60,7 +60,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha, const oclMat &src3, double beta, oclMat &dst, int flags) { CV_Assert(src1.cols == src2.rows && - (src3.empty() || src1.rows == src3.rows && src2.cols == src3.cols)); + (src3.empty() || (src1.rows == src3.rows && src2.cols == src3.cols))); CV_Assert(!(cv::GEMM_3_T & flags)); // cv::GEMM_3_T is not supported if(!src3.empty()) { diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index 5206f96bc..09e1816e3 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -20,6 +20,7 @@ // Jia Haipeng, jiahaipeng95@gmail.com // Wu Xinglong, wxl370@126.com // Wang Yao, bitwangyaoyao@gmail.com +// Sen Liu, swjtuls1987@126.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -144,7 +145,8 @@ typedef struct int imgoff; float factor; } detect_piramid_info; -#ifdef WIN32 + +#if defined WIN32 && !defined __MINGW__ && !defined __MINGW32__ #define _ALIGNED_ON(_ALIGNMENT) __declspec(align(_ALIGNMENT)) typedef _ALIGNED_ON(128) struct GpuHidHaarFeature { @@ -841,15 +843,13 @@ static void 
gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade } /* j */ } } + CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemStorage *storage, double scaleFactor, int minNeighbors, int flags, CvSize minSize, CvSize maxSize) { CvHaarClassifierCascade *cascade = oldCascade; - //double alltime = (double)cvGetTickCount(); - //double t = (double)cvGetTickCount(); const double GROUP_EPS = 0.2; - oclMat gtemp, gsum1, gtilted1, gsqsum1, gnormImg, gsumcanny; CvSeq *result_seq = 0; cv::Ptr temp_storage; @@ -860,7 +860,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS int datasize=0; int totalclassifier=0; - //void *out; GpuHidHaarClassifierCascade *gcascade; GpuHidHaarStageClassifier *stage; GpuHidHaarClassifier *classifier; @@ -869,11 +868,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS int *candidate; cl_int status; - // bool doCannyPruning = (flags & CV_HAAR_DO_CANNY_PRUNING) != 0; bool findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0; - // bool roughSearch = (flags & CV_HAAR_DO_ROUGH_SEARCH) != 0; - //double t = 0; if( maxSize.height == 0 || maxSize.width == 0 ) { maxSize.height = gimg.rows; @@ -895,27 +891,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS if( findBiggestObject ) flags &= ~CV_HAAR_SCALE_IMAGE; - //gtemp = oclMat( gimg.rows, gimg.cols, CV_8UC1); - //gsum1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32SC1 ); - //gsqsum1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32FC1 ); - if( !cascade->hid_cascade ) - /*out = (void *)*/gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier); - if( cascade->hid_cascade->has_tilted_features ) - gtilted1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32SC1 ); + gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier); result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), storage ); if( CV_MAT_CN(gimg.type()) > 1 ) { + oclMat gtemp; cvtColor( gimg, gtemp, COLOR_BGR2GRAY ); gimg = gtemp; } if( findBiggestObject ) flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING); - //t = (double)cvGetTickCount() - t; - //printf( "before if time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) 
); if( gimg.cols < minSize.width || gimg.rows < minSize.height ) CV_Error(CV_StsError, "Image too small"); @@ -923,12 +912,9 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS if( (flags & CV_HAAR_SCALE_IMAGE) ) { CvSize winSize0 = cascade->orig_window_size; - //float scalefactor = 1.1f; - //float factor = 1.f; int totalheight = 0; int indexy = 0; CvSize sz; - //t = (double)cvGetTickCount(); std::vector sizev; std::vector scalev; for(factor = 1.f;; factor *= scaleFactor) @@ -949,20 +935,15 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS sizev.push_back(sz); scalev.push_back(factor); } - //int flag = 0; oclMat gimg1(gimg.rows, gimg.cols, CV_8UC1); oclMat gsum(totalheight + 4, gimg.cols + 1, CV_32SC1); oclMat gsqsum(totalheight + 4, gimg.cols + 1, CV_32FC1); - //cl_mem cascadebuffer; cl_mem stagebuffer; - //cl_mem classifierbuffer; cl_mem nodebuffer; cl_mem candidatebuffer; cl_mem scaleinfobuffer; - //cl_kernel kernel; - //kernel = openCLGetKernelFromSource(gimg.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade"); cv::Rect roi, roi2; cv::Mat imgroi, imgroisq; cv::ocl::oclMat resizeroi, gimgroi, gimgroisq; @@ -970,18 +951,13 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS size_t blocksize = 8; size_t localThreads[3] = { blocksize, blocksize , 1 }; - size_t globalThreads[3] = { grp_per_CU *((gsum.clCxt)->computeUnits()) *localThreads[0], + size_t globalThreads[3] = { grp_per_CU * gsum.clCxt->computeUnits() *localThreads[0], localThreads[1], 1 }; int outputsz = 256 * globalThreads[0] / localThreads[0]; int loopcount = sizev.size(); detect_piramid_info *scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount); - //t = (double)cvGetTickCount() - t; - // printf( "pre time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) 
); - //int *it =scaleinfo; - // t = (double)cvGetTickCount(); - for( int i = 0; i < loopcount; i++ ) { sz = sizev[i]; @@ -991,7 +967,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS resizeroi = gimg1(roi2); gimgroi = gsum(roi); gimgroisq = gsqsum(roi); - //scaleinfo[i].rows = gimgroi.rows; int width = gimgroi.cols - 1 - cascade->orig_window_size.width; int height = gimgroi.rows - 1 - cascade->orig_window_size.height; scaleinfo[i].width_height = (width << 16) | height; @@ -999,76 +974,40 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS int grpnumperline = (width + localThreads[0] - 1) / localThreads[0]; int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline; - //outputsz +=width*height; scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp; scaleinfo[i].imgoff = gimgroi.offset >> 2; scaleinfo[i].factor = factor; - //printf("rows = %d,ystep = %d,width = %d,height = %d,grpnumperline = %d,totalgrp = %d,imgoff = %d,factor = %f\n", - // scaleinfo[i].rows,scaleinfo[i].ystep,scaleinfo[i].width,scaleinfo[i].height,scaleinfo[i].grpnumperline, - // scaleinfo[i].totalgrp,scaleinfo[i].imgoff,scaleinfo[i].factor); cv::ocl::resize(gimg, resizeroi, Size(sz.width - 1, sz.height - 1), 0, 0, INTER_LINEAR); - //cv::imwrite("D:\\1.jpg",gimg1); cv::ocl::integral(resizeroi, gimgroi, gimgroisq); - //cv::ocl::oclMat chk(sz.height,sz.width,CV_32SC1),chksq(sz.height,sz.width,CV_32FC1); - //cv::ocl::integral(gimg1, chk, chksq); - //double r = cv::norm(chk,gimgroi,NORM_INF); - //if(r > std::numeric_limits::epsilon()) - //{ - // printf("failed"); - //} indexy += sz.height; } - //int ystep = factor > 2 ? 1 : 2; - // t = (double)cvGetTickCount() - t; - //printf( "resize integral time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) 
); - //t = (double)cvGetTickCount(); + gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade; stage = (GpuHidHaarStageClassifier *)(gcascade + 1); classifier = (GpuHidHaarClassifier *)(stage + gcascade->count); node = (GpuHidHaarTreeNode *)(classifier->node); - //int m,n; - //m = (gsum.cols - 1 - cascade->orig_window_size.width + ystep - 1)/ystep; - //n = (gsum.rows - 1 - cascade->orig_window_size.height + ystep - 1)/ystep; - //int counter = m*n; - int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) - sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode); - //if(flag == 0){ - candidate = (int *)malloc(4 * sizeof(int) * outputsz); - //memset((char*)candidate,0,4*sizeof(int)*outputsz); - gpuSetImagesForHaarClassifierCascade( cascade,/* &sum1, &sqsum1, _tilted,*/ 1., gsum.step / 4 ); - //cascadebuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifierCascade),NULL,&status); - //openCLVerifyCall(status); - //openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,cascadebuffer,1,0,sizeof(GpuHidHaarClassifierCascade),gcascade,0,NULL,NULL)); + candidate = (int *)malloc(4 * sizeof(int) * outputsz); + + gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 ); stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count); - //openCLVerifyCall(status); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL)); - - //classifierbuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifier)*totalclassifier,NULL,&status); - //status = clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,classifierbuffer,1,0,sizeof(GpuHidHaarClassifier)*totalclassifier,classifier,0,NULL,NULL); + cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue(); + openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL)); nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode)); - //openCLVerifyCall(status); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), nodebuffer, 1, 0, - nodenum * sizeof(GpuHidHaarTreeNode), + + openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0, nodenum * sizeof(GpuHidHaarTreeNode), node, 0, NULL, NULL)); candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY, 4 * sizeof(int) * outputsz); - //openCLVerifyCall(status); + scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount); - //openCLVerifyCall(status); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL)); - //flag = 1; - //} + openCLSafeCall(clEnqueueWriteBuffer(qu, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL)); - //t = (double)cvGetTickCount() - t; - //printf( "update time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) 
); - - //size_t globalThreads[3] = { counter+blocksize*blocksize-counter%(blocksize*blocksize),1,1}; - //t = (double)cvGetTickCount(); int startstage = 0; int endstage = gcascade->count; int startnode = 0; @@ -1086,11 +1025,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS pq.s[3] = gcascade->pq3; float correction = gcascade->inv_window_area; - //int grpnumperline = ((m + localThreads[0] - 1) / localThreads[0]); - //int totalgrp = ((n + localThreads[1] - 1) / localThreads[1])*grpnumperline; - // openCLVerifyKernel(gsum.clCxt, kernel, &blocksize, globalThreads, localThreads); - //openCLSafeCall(clSetKernelArg(kernel,argcount++,sizeof(cl_mem),(void*)&cascadebuffer)); - std::vector > args; args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&stagebuffer )); args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer )); @@ -1110,28 +1044,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS args.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction )); openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1); - //t = (double)cvGetTickCount() - t; - //printf( "detection time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) ); - //t = (double)cvGetTickCount(); - //openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->impl->clCmdQueue, candidatebuffer, 1, 0, 4 * sizeof(int)*outputsz, candidate, 0, NULL, NULL)); + openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); for(int i = 0; i < outputsz; i++) if(candidate[4 * i + 2] != 0) - allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], candidate[4 * i + 2], candidate[4 * i + 3])); - // t = (double)cvGetTickCount() - t; - //printf( "post time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) ); - //t = (double)cvGetTickCount(); + allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], + candidate[4 * i + 2], candidate[4 * i + 3])); + free(scaleinfo); free(candidate); - //openCLSafeCall(clReleaseMemObject(cascadebuffer)); openCLSafeCall(clReleaseMemObject(stagebuffer)); openCLSafeCall(clReleaseMemObject(scaleinfobuffer)); openCLSafeCall(clReleaseMemObject(nodebuffer)); openCLSafeCall(clReleaseMemObject(candidatebuffer)); - // openCLSafeCall(clReleaseKernel(kernel)); - //t = (double)cvGetTickCount() - t; - //printf( "release time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) 
); } else { @@ -1149,7 +1075,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS classifier = (GpuHidHaarClassifier *)(stage + gcascade->count); node = (GpuHidHaarTreeNode *)(classifier->node); cl_mem stagebuffer; - //cl_mem classifierbuffer; cl_mem nodebuffer; cl_mem candidatebuffer; cl_mem scaleinfobuffer; @@ -1184,24 +1109,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS size_t blocksize = 8; size_t localThreads[3] = { blocksize, blocksize , 1 }; size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->computeUnits() *localThreads[0], - localThreads[1], 1 - }; + localThreads[1], 1 }; int outputsz = 256 * globalThreads[0] / localThreads[0]; int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) - sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode); nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode)); - //openCLVerifyCall(status); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), nodebuffer, 1, 0, + cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue(); + openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0, nodenum * sizeof(GpuHidHaarTreeNode), node, 0, NULL, NULL)); cl_mem newnodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_WRITE, loopcount * nodenum * sizeof(GpuHidHaarTreeNode)); int startstage = 0; int endstage = gcascade->count; - //cl_kernel kernel; - //kernel = openCLGetKernelFromSource(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2"); - //cl_kernel kernel2 = openCLGetKernelFromSource(gimg.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier"); for(int i = 0; i < loopcount; i++) { sz = sizev[i]; @@ -1220,7 +1141,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS int height = (gsum.rows - 1 - sz.height + ystep - 1) / ystep; int grpnumperline = (width + localThreads[0] - 1) / localThreads[0]; int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline; - //outputsz +=width*height; + scaleinfo[i].width_height = (width << 16) | height; scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp; scaleinfo[i].imgoff = 0; @@ -1238,28 +1159,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS size_t globalThreads2[3] = {nodenum, 1, 1}; openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1); - - //clEnqueueNDRangeKernel(gsum.clCxt->impl->clCmdQueue, kernel2, 1, NULL, globalThreads2, 0, 0, NULL, NULL); - //clFinish(gsum.clCxt->impl->clCmdQueue); } - //clReleaseKernel(kernel2); + int step = gsum.step / 4; int startnode = 0; int splitstage = 3; - int splitnode = stage[0].count + stage[1].count + stage[2].count; stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count); - //openCLVerifyCall(status); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL)); + openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL)); candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, 4 * sizeof(int) * outputsz); - //openCLVerifyCall(status); 
scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount); - //openCLVerifyCall(status); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL)); + openCLSafeCall(clEnqueueWriteBuffer(qu, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL)); pbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_int4) * loopcount); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL)); + openCLSafeCall(clEnqueueWriteBuffer(qu, pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL)); correctionbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_float) * loopcount); - openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL)); - //int argcount = 0; + openCLSafeCall(clEnqueueWriteBuffer(qu, correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL)); std::vector > args; args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&stagebuffer )); @@ -1268,22 +1181,21 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsum.data )); args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsqsum.data )); args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&candidatebuffer )); + args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&gsum.rows )); + args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&gsum.cols )); args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&step )); args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&loopcount )); args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startstage )); args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitstage )); args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&endstage )); args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnode )); - args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitnode )); args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&pbuffer )); args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&correctionbuffer )); args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&nodenum )); - openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1); - //openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->clCmdQueue,candidatebuffer,1,0,4*sizeof(int)*outputsz,candidate,0,NULL,NULL)); - candidate = (int *)clEnqueueMapBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int), 0, 0, 0, &status); + candidate = (int *)clEnqueueMapBuffer(qu, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, &status); for(int i = 0; i < outputsz; i++) { @@ -1294,7 +1206,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS free(scaleinfo); free(p); free(correction); - clEnqueueUnmapMemObject((cl_command_queue)gsum.clCxt->oclCommandQueue(), candidatebuffer, candidate, 0, 0, 0); + clEnqueueUnmapMemObject(qu, candidatebuffer, candidate, 0, 0, 0); openCLSafeCall(clReleaseMemObject(stagebuffer)); openCLSafeCall(clReleaseMemObject(scaleinfobuffer)); 
openCLSafeCall(clReleaseMemObject(nodebuffer)); @@ -1303,20 +1215,547 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS openCLSafeCall(clReleaseMemObject(pbuffer)); openCLSafeCall(clReleaseMemObject(correctionbuffer)); } - //t = (double)cvGetTickCount() ; + cvFree(&cascade->hid_cascade); - // printf("%d\n",globalcounter); rectList.resize(allCandidates.size()); if(!allCandidates.empty()) std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin()); - //cout << "count = " << rectList.size()<< endl; - if( minNeighbors != 0 || findBiggestObject ) groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS); else rweights.resize(rectList.size(), 0); + if( findBiggestObject && rectList.size() ) + { + CvAvgComp result_comp = {{0, 0, 0, 0}, 0}; + + for( size_t i = 0; i < rectList.size(); i++ ) + { + cv::Rect r = rectList[i]; + if( r.area() > cv::Rect(result_comp.rect).area() ) + { + result_comp.rect = r; + result_comp.neighbors = rweights[i]; + } + } + cvSeqPush( result_seq, &result_comp ); + } + else + { + for( size_t i = 0; i < rectList.size(); i++ ) + { + CvAvgComp c; + c.rect = rectList[i]; + c.neighbors = rweights[i]; + cvSeqPush( result_seq, &c ); + } + } + + return result_seq; +} + +struct OclBuffers +{ + cl_mem stagebuffer; + cl_mem nodebuffer; + cl_mem candidatebuffer; + cl_mem scaleinfobuffer; + cl_mem pbuffer; + cl_mem correctionbuffer; + cl_mem newnodebuffer; +}; + +struct getRect +{ + Rect operator()(const CvAvgComp &e) const + { + return e.rect; + } +}; + +void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std::vector& faces, + double scaleFactor, int minNeighbors, int flags, + Size minSize, Size maxSize) +{ + int blocksize = 8; + int grp_per_CU = 12; + size_t localThreads[3] = { blocksize, blocksize, 1 }; + size_t globalThreads[3] = { grp_per_CU * Context::getContext()->computeUnits() * localThreads[0], + localThreads[1], + 1 }; + int outputsz = 256 * globalThreads[0] / localThreads[0]; + + Init(gimg.rows, gimg.cols, scaleFactor, flags, outputsz, localThreads, minSize, maxSize); + + const double GROUP_EPS = 0.2; + + cv::ConcurrentRectVector allCandidates; + std::vector rectList; + std::vector rweights; + + CvHaarClassifierCascade *cascade = oldCascade; + GpuHidHaarClassifierCascade *gcascade; + GpuHidHaarStageClassifier *stage; + GpuHidHaarClassifier *classifier; + GpuHidHaarTreeNode *node; + + if( CV_MAT_DEPTH(gimg.type()) != CV_8U ) + CV_Error( CV_StsUnsupportedFormat, "Only 8-bit images are supported" ); + + if( CV_MAT_CN(gimg.type()) > 1 ) + { + oclMat gtemp; + cvtColor( gimg, gtemp, CV_BGR2GRAY ); + gimg = gtemp; + } + + int *candidate; + + if( (flags & CV_HAAR_SCALE_IMAGE) ) + { + int indexy = 0; + CvSize sz; + + cv::Rect roi, roi2; + cv::Mat imgroi, imgroisq; + cv::ocl::oclMat resizeroi, gimgroi, gimgroisq; + + for( int i = 0; i < m_loopcount; i++ ) + { + sz = sizev[i]; + roi = Rect(0, indexy, sz.width, sz.height); + roi2 = Rect(0, 0, sz.width - 1, sz.height - 1); + resizeroi = gimg1(roi2); + gimgroi = gsum(roi); + gimgroisq = gsqsum(roi); + + cv::ocl::resize(gimg, resizeroi, Size(sz.width - 1, sz.height - 1), 0, 0, INTER_LINEAR); + cv::ocl::integral(resizeroi, gimgroi, gimgroisq); + indexy += sz.height; + } + + gcascade = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade); + stage = (GpuHidHaarStageClassifier *)(gcascade + 1); + classifier = (GpuHidHaarClassifier *)(stage + gcascade->count); + node = (GpuHidHaarTreeNode *)(classifier->node); + + gpuSetImagesForHaarClassifierCascade( 
cascade, 1., gsum.step / 4 ); + + cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue(); + openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0, + sizeof(GpuHidHaarStageClassifier) * gcascade->count, + stage, 0, NULL, NULL)); + + openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0, + m_nodenum * sizeof(GpuHidHaarTreeNode), + node, 0, NULL, NULL)); + + int startstage = 0; + int endstage = gcascade->count; + int startnode = 0; + int pixelstep = gsum.step / 4; + int splitstage = 3; + int splitnode = stage[0].count + stage[1].count + stage[2].count; + cl_int4 p, pq; + p.s[0] = gcascade->p0; + p.s[1] = gcascade->p1; + p.s[2] = gcascade->p2; + p.s[3] = gcascade->p3; + pq.s[0] = gcascade->pq0; + pq.s[1] = gcascade->pq1; + pq.s[2] = gcascade->pq2; + pq.s[3] = gcascade->pq3; + float correction = gcascade->inv_window_area; + + vector > args; + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->nodebuffer )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&pixelstep )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitnode )); + args.push_back ( make_pair(sizeof(cl_int4) , (void *)&p )); + args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq )); + args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction )); + + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1); + + candidate = (int *)malloc(4 * sizeof(int) * outputsz); + memset(candidate, 0, 4 * sizeof(int) * outputsz); + openCLReadBuffer( gsum.clCxt, ((OclBuffers *)buffers)->candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); + + for(int i = 0; i < outputsz; i++) + if(candidate[4 * i + 2] != 0) + allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], + candidate[4 * i + 2], candidate[4 * i + 3])); + + free((void *)candidate); + candidate = NULL; + } + else + { + cv::ocl::integral(gimg, gsum, gsqsum); + + gpuSetHaarClassifierCascade(cascade); + + gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade; + stage = (GpuHidHaarStageClassifier *)(gcascade + 1); + classifier = (GpuHidHaarClassifier *)(stage + gcascade->count); + node = (GpuHidHaarTreeNode *)(classifier->node); + + cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue(); + openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0, + m_nodenum * sizeof(GpuHidHaarTreeNode), + node, 0, NULL, NULL)); + + cl_int4 *p = (cl_int4 *)malloc(sizeof(cl_int4) * m_loopcount); + float *correction = (float *)malloc(sizeof(float) * m_loopcount); + int startstage = 0; + int endstage = gcascade->count; + double factor; + for(int i = 0; i < m_loopcount; i++) + { + factor = 
scalev[i]; + int equRect_x = (int)(factor * gcascade->p0 + 0.5); + int equRect_y = (int)(factor * gcascade->p1 + 0.5); + int equRect_w = (int)(factor * gcascade->p3 + 0.5); + int equRect_h = (int)(factor * gcascade->p2 + 0.5); + p[i].s[0] = equRect_x; + p[i].s[1] = equRect_y; + p[i].s[2] = equRect_x + equRect_w; + p[i].s[3] = equRect_y + equRect_h; + correction[i] = 1. / (equRect_w * equRect_h); + int startnodenum = m_nodenum * i; + float factor2 = (float)factor; + + vector > args1; + args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->nodebuffer )); + args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer )); + args1.push_back ( make_pair(sizeof(cl_float) , (void *)&factor2 )); + args1.push_back ( make_pair(sizeof(cl_float) , (void *)&correction[i] )); + args1.push_back ( make_pair(sizeof(cl_int) , (void *)&startnodenum )); + + size_t globalThreads2[3] = {m_nodenum, 1, 1}; + + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1); + } + + int step = gsum.step / 4; + int startnode = 0; + int splitstage = 3; + openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL)); + openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->pbuffer, 1, 0, sizeof(cl_int4)*m_loopcount, p, 0, NULL, NULL)); + openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->correctionbuffer, 1, 0, sizeof(cl_float)*m_loopcount, correction, 0, NULL, NULL)); + + vector > args; + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&step )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->pbuffer )); + args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->correctionbuffer )); + args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_nodenum )); + + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1); + + candidate = (int *)clEnqueueMapBuffer(qu, ((OclBuffers *)buffers)->candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, NULL); + + for(int i = 0; i < outputsz; i++) + { + if(candidate[4 * i + 2] != 0) + allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], + candidate[4 * i + 2], candidate[4 * i + 3])); + } + + free(p); + free(correction); + clEnqueueUnmapMemObject(qu, ((OclBuffers 
*)buffers)->candidatebuffer, candidate, 0, 0, 0); + } + + rectList.resize(allCandidates.size()); + if(!allCandidates.empty()) + std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin()); + + if( minNeighbors != 0 || findBiggestObject ) + groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS); + else + rweights.resize(rectList.size(), 0); + + GenResult(faces, rectList, rweights); +} + +void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols, + double scaleFactor, int flags, + const int outputsz, const size_t localThreads[], + CvSize minSize, CvSize maxSize) +{ + CvHaarClassifierCascade *cascade = oldCascade; + + if( !CV_IS_HAAR_CLASSIFIER(cascade) ) + CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" ); + + if( scaleFactor <= 1 ) + CV_Error( CV_StsOutOfRange, "scale factor must be > 1" ); + + if( cols < minSize.width || rows < minSize.height ) + CV_Error(CV_StsError, "Image too small"); + + int datasize=0; + int totalclassifier=0; + + if( !cascade->hid_cascade ) + gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier); + + if( maxSize.height == 0 || maxSize.width == 0 ) + { + maxSize.height = rows; + maxSize.width = cols; + } + + findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0; + if( findBiggestObject ) + flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING); + + CreateBaseBufs(datasize, totalclassifier, flags, outputsz); + CreateFactorRelatedBufs(rows, cols, flags, scaleFactor, localThreads, minSize, maxSize); + + m_scaleFactor = scaleFactor; + m_rows = rows; + m_cols = cols; + m_flags = flags; + m_minSize = minSize; + m_maxSize = maxSize; + + initialized = true; +} + +void cv::ocl::OclCascadeClassifierBuf::CreateBaseBufs(const int datasize, const int totalclassifier, + const int flags, const int outputsz) +{ + if (!initialized) + { + buffers = malloc(sizeof(OclBuffers)); + + size_t tempSize = + sizeof(GpuHidHaarStageClassifier) * ((GpuHidHaarClassifierCascade *)oldCascade->hid_cascade)->count; + m_nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) - tempSize - sizeof(GpuHidHaarClassifier) * totalclassifier) + / sizeof(GpuHidHaarTreeNode); + + ((OclBuffers *)buffers)->stagebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, tempSize); + ((OclBuffers *)buffers)->nodebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, m_nodenum * sizeof(GpuHidHaarTreeNode)); + } + + if (initialized + && ((m_flags & CV_HAAR_SCALE_IMAGE) ^ (flags & CV_HAAR_SCALE_IMAGE))) + { + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer)); + } + + if (flags & CV_HAAR_SCALE_IMAGE) + { + ((OclBuffers *)buffers)->candidatebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), + CL_MEM_WRITE_ONLY, + 4 * sizeof(int) * outputsz); + } + else + { + ((OclBuffers *)buffers)->candidatebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), + CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, + 4 * sizeof(int) * outputsz); + } +} + +void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs( + const int rows, const int cols, const int flags, + const double scaleFactor, const size_t localThreads[], + CvSize minSize, CvSize maxSize) +{ + if (initialized) + { + if ((m_flags & CV_HAAR_SCALE_IMAGE) && !(flags & CV_HAAR_SCALE_IMAGE)) + { + gimg1.release(); + gsum.release(); + gsqsum.release(); + } + else if (!(m_flags & CV_HAAR_SCALE_IMAGE) && (flags & CV_HAAR_SCALE_IMAGE)) + { + openCLSafeCall(clReleaseMemObject(((OclBuffers 
*)buffers)->newnodebuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer)); + } + else if ((m_flags & CV_HAAR_SCALE_IMAGE) && (flags & CV_HAAR_SCALE_IMAGE)) + { + if (fabs(m_scaleFactor - scaleFactor) < 1e-6 + && (rows == m_rows && cols == m_cols) + && (minSize.width == m_minSize.width) + && (minSize.height == m_minSize.height) + && (maxSize.width == m_maxSize.width) + && (maxSize.height == m_maxSize.height)) + { + return; + } + } + else + { + if (fabs(m_scaleFactor - scaleFactor) < 1e-6 + && (rows == m_rows && cols == m_cols) + && (minSize.width == m_minSize.width) + && (minSize.height == m_minSize.height) + && (maxSize.width == m_maxSize.width) + && (maxSize.height == m_maxSize.height)) + { + return; + } + else + { + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer)); + } + } + } + + int loopcount; + int indexy = 0; + int totalheight = 0; + double factor; + Rect roi; + CvSize sz; + CvSize winSize0 = oldCascade->orig_window_size; + detect_piramid_info *scaleinfo; + if (flags & CV_HAAR_SCALE_IMAGE) + { + for(factor = 1.f;; factor *= scaleFactor) + { + CvSize winSize = { cvRound(winSize0.width * factor), cvRound(winSize0.height * factor) }; + sz.width = cvRound( cols / factor ) + 1; + sz.height = cvRound( rows / factor ) + 1; + CvSize sz1 = { sz.width - winSize0.width - 1, sz.height - winSize0.height - 1 }; + + if( sz1.width <= 0 || sz1.height <= 0 ) + break; + if( winSize.width > maxSize.width || winSize.height > maxSize.height ) + break; + if( winSize.width < minSize.width || winSize.height < minSize.height ) + continue; + + totalheight += sz.height; + sizev.push_back(sz); + scalev.push_back(static_cast(factor)); + } + + loopcount = sizev.size(); + gimg1.create(rows, cols, CV_8UC1); + gsum.create(totalheight + 4, cols + 1, CV_32SC1); + gsqsum.create(totalheight + 4, cols + 1, CV_32FC1); + + scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount); + for( int i = 0; i < loopcount; i++ ) + { + sz = sizev[i]; + roi = Rect(0, indexy, sz.width, sz.height); + int width = sz.width - 1 - oldCascade->orig_window_size.width; + int height = sz.height - 1 - oldCascade->orig_window_size.height; + int grpnumperline = (width + localThreads[0] - 1) / localThreads[0]; + int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline; + + ((detect_piramid_info *)scaleinfo)[i].width_height = (width << 16) | height; + ((detect_piramid_info *)scaleinfo)[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp; + ((detect_piramid_info *)scaleinfo)[i].imgoff = gsum(roi).offset >> 2; + ((detect_piramid_info *)scaleinfo)[i].factor = scalev[i]; + + indexy += sz.height; + } + } + else + { + for(factor = 1; + cvRound(factor * winSize0.width) < cols - 10 && cvRound(factor * winSize0.height) < rows - 10; + factor *= scaleFactor) + { + CvSize winSize = { cvRound( winSize0.width * factor ), cvRound( winSize0.height * factor ) }; + if( winSize.width < minSize.width || winSize.height < minSize.height ) + { + continue; + } + sizev.push_back(winSize); + scalev.push_back(factor); + } + + loopcount = scalev.size(); + if(loopcount == 0) + { + loopcount = 1; + sizev.push_back(minSize); + scalev.push_back( min(cvRound(minSize.width / winSize0.width), cvRound(minSize.height / winSize0.height)) ); + } + + 
((OclBuffers *)buffers)->pbuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, + sizeof(cl_int4) * loopcount); + ((OclBuffers *)buffers)->correctionbuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, + sizeof(cl_float) * loopcount); + ((OclBuffers *)buffers)->newnodebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_WRITE, + loopcount * m_nodenum * sizeof(GpuHidHaarTreeNode)); + + scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount); + for( int i = 0; i < loopcount; i++ ) + { + sz = sizev[i]; + factor = scalev[i]; + int ystep = cvRound(std::max(2., factor)); + int width = (cols - 1 - sz.width + ystep - 1) / ystep; + int height = (rows - 1 - sz.height + ystep - 1) / ystep; + int grpnumperline = (width + localThreads[0] - 1) / localThreads[0]; + int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline; + + ((detect_piramid_info *)scaleinfo)[i].width_height = (width << 16) | height; + ((detect_piramid_info *)scaleinfo)[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp; + ((detect_piramid_info *)scaleinfo)[i].imgoff = 0; + ((detect_piramid_info *)scaleinfo)[i].factor = factor; + } + } + + if (loopcount != m_loopcount) + { + if (initialized) + { + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer)); + } + ((OclBuffers *)buffers)->scaleinfobuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount); + } + + openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)cv::ocl::Context::getContext()->oclCommandQueue(), ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0, + sizeof(detect_piramid_info)*loopcount, + scaleinfo, 0, NULL, NULL)); + free(scaleinfo); + + m_loopcount = loopcount; +} + +void cv::ocl::OclCascadeClassifierBuf::GenResult(CV_OUT std::vector& faces, + const std::vector &rectList, + const std::vector &rweights) +{ + CvSeq *result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), cvCreateMemStorage(0) ); if( findBiggestObject && rectList.size() ) { @@ -1343,13 +1782,34 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS cvSeqPush( result_seq, &c ); } } - //t = (double)cvGetTickCount() - t; - //printf( "get face time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) ); - //alltime = (double)cvGetTickCount() - alltime; - //printf( "all time = %g ms\n", alltime/((double)cvGetTickFrequency()*1000.) 
); - return result_seq; + + vector vecAvgComp; + Seq(result_seq).copyTo(vecAvgComp); + faces.resize(vecAvgComp.size()); + std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect()); } +void cv::ocl::OclCascadeClassifierBuf::release() +{ + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->stagebuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->nodebuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer)); + + if( (m_flags & CV_HAAR_SCALE_IMAGE) ) + { + cvFree(&oldCascade->hid_cascade); + } + else + { + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer)); + openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer)); + } + + free(buffers); + buffers = NULL; +} #ifndef _MAX_PATH #define _MAX_PATH 1024 diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index 812a1b294..d703a61b2 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -1012,10 +1012,8 @@ namespace cv warpPerspective_gpu(src, dst, coeffs, interpolation); } - //////////////////////////////////////////////////////////////////////// // integral - void integral(const oclMat &src, oclMat &sum, oclMat &sqsum) { CV_Assert(src.type() == CV_8UC1); @@ -1029,13 +1027,24 @@ namespace cv int vcols = (pre_invalid + src.cols + vlen - 1) / vlen; oclMat t_sum , t_sqsum; - t_sum.create(src.cols, src.rows, CV_32SC1); - t_sqsum.create(src.cols, src.rows, CV_32FC1); - int w = src.cols + 1, h = src.rows + 1; - sum.create(h, w, CV_32SC1); + int depth; + if( src.cols * src.rows <= 2901 * 2901 ) //2901 is the maximum size for int when all values are 255 + { + t_sum.create(src.cols, src.rows, CV_32SC1); + sum.create(h, w, CV_32SC1); + } + else + { + //Use float to prevent overflow + t_sum.create(src.cols, src.rows, CV_32FC1); + sum.create(h, w, CV_32FC1); + } + t_sqsum.create(src.cols, src.rows, CV_32FC1); sqsum.create(h, w, CV_32FC1); - int sum_offset = sum.offset / vlen, sqsum_offset = sqsum.offset / vlen; + depth = sum.depth(); + int sum_offset = sum.offset / vlen; + int sqsum_offset = sqsum.offset / vlen; std::vector > args; args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); @@ -1048,7 +1057,7 @@ namespace cv args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step )); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.step)); size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1}; - openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_cols", gt, lt, args, -1, -1); + openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_cols", gt, lt, args, -1, depth); args.clear(); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sum.data )); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data )); @@ -1062,9 +1071,9 @@ namespace cv args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum_offset)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sqsum_offset)); size_t gt2[3] = {t_sum.cols * 32, 1, 1}, lt2[3] = {256, 1, 1}; - openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_rows", gt2, lt2, args, -1, -1); - //std::cout << "tested" << std::endl; + openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_rows", gt2, lt2, args, -1, depth); } + void integral(const oclMat &src, oclMat &sum) { CV_Assert(src.type() == CV_8UC1); @@ -1074,10 
+1083,18 @@ namespace cv int vcols = (pre_invalid + src.cols + vlen - 1) / vlen; oclMat t_sum; - t_sum.create(src.cols, src.rows, CV_32SC1); - int w = src.cols + 1, h = src.rows + 1; - sum.create(h, w, CV_32SC1); + int depth; + if(src.cols * src.rows <= 2901 * 2901) + { + t_sum.create(src.cols, src.rows, CV_32SC1); + sum.create(h, w, CV_32SC1); + }else + { + t_sum.create(src.cols, src.rows, CV_32FC1); + sum.create(h, w, CV_32FC1); + } + depth = sum.depth(); int sum_offset = sum.offset / vlen; std::vector > args; @@ -1090,7 +1107,7 @@ namespace cv args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step )); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.step)); size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1}; - openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_cols", gt, lt, args, -1, -1); + openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_cols", gt, lt, args, -1, depth); args.clear(); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sum.data )); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sum.data )); @@ -1100,7 +1117,7 @@ namespace cv args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum.step)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum_offset)); size_t gt2[3] = {t_sum.cols * 32, 1, 1}, lt2[3] = {256, 1, 1}; - openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_rows", gt2, lt2, args, -1, -1); + openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_rows", gt2, lt2, args, -1, depth); //std::cout << "tested" << std::endl; } diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp index 00dd1c698..1a96f785e 100644 --- a/modules/ocl/src/initialization.cpp +++ b/modules/ocl/src/initialization.cpp @@ -128,6 +128,8 @@ namespace cv std::vector devices; std::vector devName; String platName; + String clVersion; + cl_context oclcontext; cl_command_queue clCmdQueue; int devnum; @@ -260,7 +262,7 @@ namespace cv int setDevMemType(DevMemRW rw_type, DevMemType mem_type) { - if( (mem_type == DEVICE_MEM_PM && + if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) ) return -1; gDeviceMemRW = rw_type; @@ -303,6 +305,7 @@ namespace cv const static int max_name_length = 256; char deviceName[max_name_length]; char plfmName[max_name_length]; + char clVersion[256]; for (unsigned i = 0; i < numPlatforms; ++i) { @@ -322,6 +325,8 @@ namespace cv ocltmpinfo.PlatformName = String(plfmName); ocltmpinfo.impl->platName = String(plfmName); ocltmpinfo.impl->oclplatform = platforms[i]; + openCLSafeCall(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(clVersion), clVersion, NULL)); + ocltmpinfo.impl->clVersion = clVersion; for(unsigned j = 0; j < numsdev; ++j) { ocltmpinfo.impl->devices.push_back(devices[j]); @@ -424,13 +429,13 @@ namespace cv } void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch, - size_t widthInBytes, size_t height, + size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type, void* hptr) { cl_int status; if(hptr && (mem_type==DEVICE_MEM_UHP || mem_type==DEVICE_MEM_CHP)) - *dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, - gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type], + *dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, + gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type], widthInBytes * height, hptr, &status); else *dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, 
gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type], @@ -985,6 +990,8 @@ namespace cv return impl->double_support == 1; case CL_UNIFIED_MEM: return impl->unified_memory == 1; + case CL_VER_1_2: + return impl->clVersion.find("OpenCL 1.2") != String::npos; default: return false; } diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp index 4d697a2d5..25252eaac 100644 --- a/modules/ocl/src/matrix_operations.cpp +++ b/modules/ocl/src/matrix_operations.cpp @@ -196,7 +196,7 @@ void cv::ocl::oclMat::upload(const Mat &m) // try to use host ptr createEx(wholeSize, m.type(), gDeviceMemRW, gDeviceMemType, m.datastart); if(gDeviceMemType!=DEVICE_MEM_UHP && gDeviceMemType!=DEVICE_MEM_CHP) - openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, + openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice); } @@ -571,11 +571,16 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, Stri CV_Error(Error::StsUnsupportedFormat, "unknown depth"); } #ifdef CL_VERSION_1_2 - if(dst.offset == 0 && dst.cols == dst.wholecols) + //this enables backwards portability to + //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support + if(Context::getContext()->supportsFeature(Context::CL_VER_1_2) && + dst.offset == 0 && dst.cols == dst.wholecols) { - clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(), (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL); + clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(), + (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL); } else +#endif { args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data )); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols )); @@ -583,17 +588,8 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, Stri args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel )); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel)); openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads, - localThreads, args, -1, -1, compile_option); + localThreads, args, -1, -1, compile_option); } -#else - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel)); - openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads, - localThreads, args, -1, -1, compile_option); -#endif } static void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &mask, String kernelName) @@ -887,7 +883,7 @@ oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const } -void cv::ocl::oclMat::createEx(Size size, int type, +void cv::ocl::oclMat::createEx(Size size, int type, DevMemRW rw_type, DevMemType mem_type, void* hptr) { createEx(size.height, size.width, type, rw_type, mem_type, hptr); @@ -898,7 +894,7 @@ void cv::ocl::oclMat::create(int _rows, int _cols, int _type) createEx(_rows, _cols, _type, gDeviceMemRW, gDeviceMemType); } -void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type, +void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type, DevMemRW rw_type, DevMemType 
mem_type, void* hptr) { clCxt = Context::getContext(); @@ -919,7 +915,7 @@ void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type, size_t esz = elemSize(); void *dev_ptr; - openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), + openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), rows, rw_type, mem_type, hptr); if (esz * cols == step) diff --git a/modules/ocl/src/mcwutil.cpp b/modules/ocl/src/mcwutil.cpp index 118df5267..0bdaf0d36 100644 --- a/modules/ocl/src/mcwutil.cpp +++ b/modules/ocl/src/mcwutil.cpp @@ -43,11 +43,10 @@ // //M*/ +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS #include "precomp.hpp" -#ifndef CL_VERSION_1_2 -#define CL_VERSION_1_2 0 -#endif +using namespace std; namespace cv { @@ -160,30 +159,44 @@ namespace cv CV_Error(-1, "Image forma is not supported"); break; } -#if CL_VERSION_1_2 - cl_image_desc desc; - desc.image_type = CL_MEM_OBJECT_IMAGE2D; - desc.image_width = mat.cols; - desc.image_height = mat.rows; - desc.image_depth = 0; - desc.image_array_size = 1; - desc.image_row_pitch = 0; - desc.image_slice_pitch = 0; - desc.buffer = NULL; - desc.num_mip_levels = 0; - desc.num_samples = 0; - texture = clCreateImage((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err); -#else - texture = clCreateImage2D( - (cl_context)mat.clCxt->oclContext(), - CL_MEM_READ_WRITE, - &format, - mat.cols, - mat.rows, - 0, - NULL, - &err); +#ifdef CL_VERSION_1_2 + //this enables backwards portability to + //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support + if(Context::getContext()->supportsFeature(Context::CL_VER_1_2)) + { + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = mat.cols; + desc.image_height = mat.rows; + desc.image_depth = 0; + desc.image_array_size = 1; + desc.image_row_pitch = 0; + desc.image_slice_pitch = 0; + desc.buffer = NULL; + desc.num_mip_levels = 0; + desc.num_samples = 0; + texture = clCreateImage((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err); + } + else #endif + { +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + texture = clCreateImage2D( + (cl_context)mat.clCxt->oclContext(), + CL_MEM_READ_WRITE, + &format, + mat.cols, + mat.rows, + 0, + NULL, + &err); +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + } size_t origin[] = { 0, 0, 0 }; size_t region[] = { mat.cols, mat.rows, 1 }; @@ -196,7 +209,7 @@ namespace cv clEnqueueCopyBufferRect((cl_command_queue)mat.clCxt->oclCommandQueue(), (cl_mem)mat.data, devData, origin, origin, regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL); clFlush((cl_command_queue)mat.clCxt->oclCommandQueue()); - } + } else { devData = (cl_mem)mat.data; @@ -212,7 +225,6 @@ namespace cv openCLSafeCall(err); return texture; } - void releaseTexture(cl_mem& texture) { openCLFree(texture); diff --git a/modules/ocl/src/moments.cpp b/modules/ocl/src/moments.cpp index 116d895db..66a1a3355 100644 --- a/modules/ocl/src/moments.cpp +++ b/modules/ocl/src/moments.cpp @@ -330,7 +330,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) mom->m12 = dstsum[8]; mom->m03 = dstsum[9]; delete [] dstsum; - + openCLSafeCall(clReleaseMemObject(sum)); icvCompleteMomentState( mom ); } diff --git a/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl deleted file mode 100644 index ce1ae39f6..000000000 --- 
a/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl +++ /dev/null @@ -1,966 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jiang Liyuan, jlyuan001.good@163.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -// -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif - -////////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////BITWISE_AND//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************and with scalar without mask**************************************/ -__kernel void arithm_s_bitwise_and_C1_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data & src2_data; - - data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_and_C1_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data & src2_data; - - data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : data.w; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_and_C1_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort2 src2_data = (ushort2)(src2.x, src2.x); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - ushort2 tmp_data = src1_data & src2_data; - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_and_C1_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - short2 src2_data = (short2)(src2.x, src2.x); - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - - short2 tmp_data = src1_data & src2_data; - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? 
tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_and_C1_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - - int data = src_data1 & src_data2; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_and_C1_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - char4 src1_data = *((__global char4 *)((__global char *)src1 + src1_index)); - char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - - char4 data = *((__global char4 *)((__global char *)dst + dst_index)); - char4 tmp_data = src1_data & src2_data; - - data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : data.w; - - *((__global char4 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_C1_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - - short4 tmp_data = src1_data & src2_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = tmp_data; - } -} -#endif -__kernel void arithm_s_bitwise_and_C2_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data & src2_data; - - - data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; - data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_and_C2_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data & src2_data; - - data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; - data.zw = (dst_index + 2 < dst_end ) ? 
tmp_data.zw : data.zw; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_and_C2_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - ushort2 src_data2 = (ushort2)(src2.x, src2.y); - - ushort2 data = src_data1 & src_data2; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_and_C2_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - short2 src_data2 = (short2)(src2.x, src2.y); - - short2 data = src_data1 & src_data2; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_and_C2_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - - int2 data = src_data1 & src_data2; - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_and_C2_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); - - char8 tmp_data = src1_data & src2_data; - - *((__global char8 *)((__global char *)dst + dst_index)) = tmp_data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_C2_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index)); - short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); - - short8 tmp_data = src1_data & src2_data; - - *((__global short8 *)((__global char *)dst + dst_index)) 
= tmp_data; - } -} -#endif -__kernel void arithm_s_bitwise_and_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 & src2_data_0; - uchar4 tmp_data_1 = src1_data_1 & src2_data_1; - uchar4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_and_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = convert_char4_sat(convert_uchar4_sat(src1_data_0) & convert_uchar4_sat(src2_data_0)); - char4 tmp_data_1 = convert_char4_sat(convert_uchar4_sat(src1_data_1) & convert_uchar4_sat(src2_data_1)); - char4 tmp_data_2 = convert_char4_sat(convert_uchar4_sat(src1_data_2) & convert_uchar4_sat(src2_data_2)); - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_and_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 & src2_data_0; - ushort2 tmp_data_1 = src1_data_1 & src2_data_1; - ushort2 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_and_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 & src2_data_0; - short2 tmp_data_1 = src1_data_1 & src2_data_1; - short2 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_and_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 & src2_data_0; - int tmp_data_1 = src1_data_1 & src2_data_1; - int tmp_data_2 = src1_data_2 & src2_data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -__kernel void arithm_s_bitwise_and_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 & src2_data_0; - char4 tmp_data_1 = src1_data_1 & src2_data_1; - char4 tmp_data_2 = src1_data_2 & src2_data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 
src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 & src2_data_0; - short4 tmp_data_1 = src1_data_1 & src2_data_1; - short4 tmp_data_2 = src1_data_2 & src2_data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - } -} -#endif -__kernel void arithm_s_bitwise_and_C4_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - - uchar4 data = src_data1 & src2; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_and_C4_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - char4 src_data1 = *((__global char4 *)(src1 + src1_index)); - - char4 data = src_data1 & src2; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_and_C4_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - - ushort4 data = src_data1 & src2; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_and_C4_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - - short4 data = src_data1 & src2; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void 
arithm_s_bitwise_and_C4_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - - int4 data = src_data1 & src2; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_and_C4_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - char16 src1_data = *((__global char16 *)((__global char *)src1 + src1_index)); - char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7, - src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf); - - char16 tmp_data = src1_data & src2_data; - - *((__global char16 *)((__global char *)dst + dst_index)) = tmp_data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_C4_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0)); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8)); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf); - - short4 tmp_data_0 = src1_data_0 & src2_data_0; - short4 tmp_data_1 = src1_data_1 & src2_data_1; - short4 tmp_data_2 = src1_data_2 & src2_data_2; - short4 tmp_data_3 = src1_data_3 & src2_data_3; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3; - - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_bitwise_and.cl b/modules/ocl/src/opencl/arithm_bitwise_binary.cl similarity index 91% rename from modules/ocl/src/opencl/arithm_bitwise_and.cl rename to modules/ocl/src/opencl/arithm_bitwise_binary.cl index f666e0cfb..8bdd23c17 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_and.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_binary.cl @@ -16,6 +16,7 @@ // // @Authors // Jiang Liyuan, jlyuan001.good@163.com +// Peng Xiao, pengxiao@outlook.com // // Redistribution and use in source and binary forms, 
with or without modification, // are permitted provided that the following conditions are met: @@ -50,11 +51,17 @@ #endif #endif -////////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////BITWISE_AND//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************bitwise_and without mask**************************************/ -__kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int src1_offset, +//bitwise_binary without mask for and, or, xor operators + +///////////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////bitwise_binary/////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifndef OP_BINARY +#define OP_BINARY & +#endif + +__kernel void arithm_bitwise_binary_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, __global uchar *dst, int dst_step, int dst_offset, int rows, int cols, int dst_step1) @@ -95,7 +102,7 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data & src2_data; + uchar4 tmp_data = src1_data OP_BINARY src2_data; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; @@ -107,7 +114,7 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr } -__kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src1_offset, +__kernel void arithm_bitwise_binary_D1 (__global char *src1, int src1_step, int src1_offset, __global char *src2, int src2_step, int src2_offset, __global char *dst, int dst_step, int dst_offset, int rows, int cols, int dst_step1) @@ -148,7 +155,7 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } char4 dst_data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data & src2_data; + char4 tmp_data = src1_data OP_BINARY src2_data; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; @@ -160,7 +167,7 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src } -__kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int src1_offset, +__kernel void arithm_bitwise_binary_D2 (__global ushort *src1, int src1_step, int src1_offset, __global ushort *src2, int src2_step, int src2_offset, __global ushort *dst, int dst_step, int dst_offset, int rows, int cols, int dst_step1) @@ -202,7 +209,7 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - ushort4 tmp_data = src1_data & src2_data; + ushort4 tmp_data = src1_data OP_BINARY src2_data; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? 
tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; @@ -215,7 +222,7 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s -__kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int src1_offset, +__kernel void arithm_bitwise_binary_D3 (__global short *src1, int src1_step, int src1_offset, __global short *src2, int src2_step, int src2_offset, __global short *dst, int dst_step, int dst_offset, int rows, int cols, int dst_step1) @@ -257,7 +264,7 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - short4 tmp_data = src1_data & src2_data; + short4 tmp_data = src1_data OP_BINARY src2_data; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; @@ -270,7 +277,7 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr -__kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1_offset, +__kernel void arithm_bitwise_binary_D4 (__global int *src1, int src1_step, int src1_offset, __global int *src2, int src2_step, int src2_offset, __global int *dst, int dst_step, int dst_offset, int rows, int cols, int dst_step1) @@ -286,13 +293,13 @@ __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1 int data1 = *((__global int *)((__global char *)src1 + src1_index)); int data2 = *((__global int *)((__global char *)src2 + src2_index)); - int tmp = data1 & data2; + int tmp = data1 OP_BINARY data2; *((__global int *)((__global char *)dst + dst_index)) = tmp; } } -__kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src1_offset, +__kernel void arithm_bitwise_binary_D5 (__global char *src1, int src1_step, int src1_offset, __global char *src2, int src2_step, int src2_offset, __global char *dst, int dst_step, int dst_offset, int rows, int cols, int dst_step1) @@ -308,14 +315,14 @@ __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src char4 data1 = *((__global char4 *)((__global char *)src1 + src1_index)); char4 data2 = *((__global char4 *)((__global char *)src2 + src2_index)); - char4 tmp = data1 & data2; + char4 tmp = data1 OP_BINARY data2; *((__global char4 *)((__global char *)dst + dst_index)) = tmp; } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src1_offset, +__kernel void arithm_bitwise_binary_D6 (__global char *src1, int src1_step, int src1_offset, __global char *src2, int src2_step, int src2_offset, __global char *dst, int dst_step, int dst_offset, int rows, int cols, int dst_step1) @@ -332,7 +339,7 @@ __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src char8 data1 = *((__global char8 *)((__global char *)src1 + src1_index)); char8 data2 = *((__global char8 *)((__global char *)src2 + src2_index)); - *((__global char8 *)((__global char *)dst + dst_index)) = data1 & data2; + *((__global char8 *)((__global char *)dst + dst_index)) = data1 OP_BINARY data2; } } #endif diff --git a/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl similarity index 59% rename from modules/ocl/src/opencl/arithm_bitwise_and_mask.cl 
rename to modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl index 1382aa547..60cd18820 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl @@ -16,6 +16,7 @@ // // @Authors // Jiang Liyuan, jlyuan001.good@163.com +// Peng Xiao, pengxiao@outlook.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -49,11 +50,16 @@ #pragma OPENCL EXTENSION cl_amd_fp64:enable #endif #endif + +#ifndef OP_BINARY +#define OP_BINARY & +#endif + ////////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////BITWISE_AND//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************bitwise_and with mask**************************************/ -__kernel void arithm_bitwise_and_with_mask_C1_D0 ( +////////////////////////////////////////////bitwise_binary//////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////////// +/**************************************bitwise_binary with mask**************************************/ +__kernel void arithm_bitwise_binary_with_mask_C1_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -85,7 +91,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 ( uchar4 mask_data = vload4(0, mask + mask_index); uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data & src2_data; + uchar4 tmp_data = src1_data OP_BINARY src2_data; data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; @@ -98,7 +104,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 ( -__kernel void arithm_bitwise_and_with_mask_C1_D1 ( +__kernel void arithm_bitwise_binary_with_mask_C1_D1 ( __global char *src1, int src1_step, int src1_offset, __global char *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -130,7 +136,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 ( uchar4 mask_data = vload4(0, mask + mask_index); char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data & src2_data; + char4 tmp_data = src1_data OP_BINARY src2_data; data.x = convert_char((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; data.y = convert_char((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? 
tmp_data.y : data.y; @@ -143,7 +149,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 ( -__kernel void arithm_bitwise_and_with_mask_C1_D2 ( +__kernel void arithm_bitwise_binary_with_mask_C1_D2 ( __global ushort *src1, int src1_step, int src1_offset, __global ushort *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -161,7 +167,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 ( #ifdef dst_align #undef dst_align #endif -#define dst_align ((dst_offset >> 1) & 1) +#define dst_align ((dst_offset / 2) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -175,7 +181,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 ( uchar2 mask_data = vload2(0, mask + mask_index); ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - ushort2 tmp_data = src1_data & src2_data; + ushort2 tmp_data = src1_data OP_BINARY src2_data; data.x = convert_ushort((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; data.y = convert_ushort((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y; @@ -186,7 +192,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 ( -__kernel void arithm_bitwise_and_with_mask_C1_D3 ( +__kernel void arithm_bitwise_binary_with_mask_C1_D3 ( __global short *src1, int src1_step, int src1_offset, __global short *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -204,7 +210,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 ( #ifdef dst_align #undef dst_align #endif -#define dst_align ((dst_offset >> 1) & 1) +#define dst_align ((dst_offset / 2) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -218,7 +224,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 ( uchar2 mask_data = vload2(0, mask + mask_index); short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - short2 tmp_data = src1_data & src2_data; + short2 tmp_data = src1_data OP_BINARY src2_data; data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y; @@ -229,7 +235,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 ( -__kernel void arithm_bitwise_and_with_mask_C1_D4 ( +__kernel void arithm_bitwise_binary_with_mask_C1_D4 ( __global int *src1, int src1_step, int src1_offset, __global int *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -253,7 +259,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D4 ( int src_data2 = *((__global int *)((__global char *)src2 + src2_index)); int dst_data = *((__global int *)((__global char *)dst + dst_index)); - int data = src_data1 & src_data2; + int data = src_data1 OP_BINARY src_data2; data = mask_data ? 
data : dst_data; *((__global int *)((__global char *)dst + dst_index)) = data; @@ -262,7 +268,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D4 ( -__kernel void arithm_bitwise_and_with_mask_C1_D5 ( +__kernel void arithm_bitwise_binary_with_mask_C1_D5 ( __global char *src1, int src1_step, int src1_offset, __global char *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -286,7 +292,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D5 ( char4 src_data2 = *((__global char4 *)((__global char *)src2 + src2_index)); char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index)); - char4 data = src_data1 & src_data2; + char4 data = src_data1 OP_BINARY src_data2; data = mask_data ? data : dst_data; *((__global char4 *)((__global char *)dst + dst_index)) = data; @@ -295,7 +301,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D5 ( -__kernel void arithm_bitwise_and_with_mask_C1_D6 ( +__kernel void arithm_bitwise_binary_with_mask_C1_D6 ( __global char *src1, int src1_step, int src1_offset, __global char *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -319,7 +325,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D6 ( char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index)); char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index)); - char8 data = src_data1 & src_data2; + char8 data = src_data1 OP_BINARY src_data2; data = mask_data ? data : dst_data; *((__global char8 *)((__global char *)dst + dst_index)) = data; @@ -329,7 +335,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D6 ( -__kernel void arithm_bitwise_and_with_mask_C2_D0 ( +__kernel void arithm_bitwise_binary_with_mask_C2_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -347,7 +353,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 ( #ifdef dst_align #undef dst_align #endif -#define dst_align ((dst_offset >> 1) & 1) +#define dst_align ((dst_offset / 2) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -361,7 +367,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 ( uchar2 mask_data = vload2(0, mask + mask_index); uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data & src2_data; + uchar4 tmp_data = src1_data OP_BINARY src2_data; data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.zw : data.zw; @@ -371,7 +377,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 ( } -__kernel void arithm_bitwise_and_with_mask_C2_D1 ( +__kernel void arithm_bitwise_binary_with_mask_C2_D1 ( __global char *src1, int src1_step, int src1_offset, __global char *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -389,7 +395,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 ( #ifdef dst_align #undef dst_align #endif -#define dst_align ((dst_offset >> 1) & 1) +#define dst_align ((dst_offset / 2) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -403,7 +409,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 ( uchar2 mask_data = vload2(0, mask + mask_index); char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data & src2_data; + char4 tmp_data = src1_data OP_BINARY src2_data; data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw; @@ -412,7 +418,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 ( } } -__kernel void arithm_bitwise_and_with_mask_C2_D2 ( +__kernel void arithm_bitwise_binary_with_mask_C2_D2 ( __global ushort *src1, int src1_step, int src1_offset, __global ushort *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -436,13 +442,13 @@ __kernel void arithm_bitwise_and_with_mask_C2_D2 ( ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index)); ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - ushort2 data = src_data1 & src_data2; + ushort2 data = src_data1 OP_BINARY src_data2; data = mask_data ? data : dst_data; *((__global ushort2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_and_with_mask_C2_D3 ( +__kernel void arithm_bitwise_binary_with_mask_C2_D3 ( __global short *src1, int src1_step, int src1_offset, __global short *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -466,13 +472,13 @@ __kernel void arithm_bitwise_and_with_mask_C2_D3 ( short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index)); short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - short2 data = src_data1 & src_data2; + short2 data = src_data1 OP_BINARY src_data2; data = mask_data ? data : dst_data; *((__global short2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_and_with_mask_C2_D4 ( +__kernel void arithm_bitwise_binary_with_mask_C2_D4 ( __global int *src1, int src1_step, int src1_offset, __global int *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -496,13 +502,13 @@ __kernel void arithm_bitwise_and_with_mask_C2_D4 ( int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index)); int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - int2 data = src_data1 & src_data2; + int2 data = src_data1 OP_BINARY src_data2; data = mask_data ? 
data : dst_data; *((__global int2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_and_with_mask_C2_D5 ( +__kernel void arithm_bitwise_binary_with_mask_C2_D5 ( __global char *src1, int src1_step, int src1_offset, __global char *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -526,14 +532,14 @@ __kernel void arithm_bitwise_and_with_mask_C2_D5 ( char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index)); char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index)); - char8 data = src_data1 & src_data2; + char8 data = src_data1 OP_BINARY src_data2; data = mask_data ? data : dst_data; *((__global char8 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_and_with_mask_C2_D6 ( +__kernel void arithm_bitwise_binary_with_mask_C2_D6 ( __global char *src1, int src1_step, int src1_offset, __global char *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -557,7 +563,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 ( char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index)); char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index)); - char16 data = src_data1 & src_data2; + char16 data = src_data1 OP_BINARY src_data2; data = mask_data ? data : dst_data; *((__global char16 *)((__global char *)dst + dst_index)) = data; @@ -565,398 +571,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 ( } - -__kernel void arithm_bitwise_and_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 & src2_data_0; - uchar4 tmp_data_1 = src1_data_1 & src2_data_1; - uchar4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? 
tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_bitwise_and_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = vload4(0, src2 + src2_index + 0); - char4 src2_data_1 = vload4(0, src2 + src2_index + 4); - char4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 & src2_data_0; - char4 tmp_data_1 = src1_data_1 & src2_data_1; - char4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_bitwise_and_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 & src2_data_0 ; - ushort2 tmp_data_1 = src1_data_1 & src2_data_1 ; - ushort2 tmp_data_2 = src1_data_2 & src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_and_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 & src2_data_0 ; - short2 tmp_data_1 = src1_data_1 & src2_data_1 ; - short2 tmp_data_2 = src1_data_2 & src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_and_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0)); - int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 & src2_data_0 ; - int tmp_data_1 = src1_data_1 & src2_data_1 ; - int tmp_data_2 = src1_data_2 & src2_data_2 ; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_and_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0)); - char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4)); - char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 & src2_data_0; - char4 tmp_data_1 = src1_data_1 & src2_data_1; - char4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_and_with_mask_C3_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 )); - char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 )); - char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16)); - - char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 )); - char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 )); - char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - char8 data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0 )); - char8 data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8 )); - char8 data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16)); - - char8 tmp_data_0 = src1_data_0 & src2_data_0; - char8 tmp_data_1 = src1_data_1 & src2_data_1; - char8 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif - - -__kernel void arithm_bitwise_and_with_mask_C4_D0 ( +__kernel void arithm_bitwise_binary_with_mask_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -980,7 +595,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D0 ( uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 data = src_data1 & src_data2; + uchar4 data = src_data1 OP_BINARY src_data2; data = mask_data ? data : dst_data; *((__global uchar4 *)(dst + dst_index)) = data; @@ -988,7 +603,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D0 ( } -__kernel void arithm_bitwise_and_with_mask_C4_D1 ( +__kernel void arithm_bitwise_binary_with_mask_C4_D1 ( __global char *src1, int src1_step, int src1_offset, __global char *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -1012,14 +627,14 @@ __kernel void arithm_bitwise_and_with_mask_C4_D1 ( char4 src_data2 = *((__global char4 *)(src2 + src2_index)); char4 dst_data = *((__global char4 *)(dst + dst_index)); - char4 data = src_data1 & src_data2; + char4 data = src_data1 OP_BINARY src_data2; data = mask_data ? 
data : dst_data; *((__global char4 *)(dst + dst_index)) = data; } } -__kernel void arithm_bitwise_and_with_mask_C4_D2 ( +__kernel void arithm_bitwise_binary_with_mask_C4_D2 ( __global ushort *src1, int src1_step, int src1_offset, __global ushort *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -1043,13 +658,13 @@ __kernel void arithm_bitwise_and_with_mask_C4_D2 ( ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - ushort4 data = src_data1 & src_data2; + ushort4 data = src_data1 OP_BINARY src_data2; data = mask_data ? data : dst_data; *((__global ushort4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_and_with_mask_C4_D3 ( +__kernel void arithm_bitwise_binary_with_mask_C4_D3 ( __global short *src1, int src1_step, int src1_offset, __global short *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -1073,13 +688,13 @@ __kernel void arithm_bitwise_and_with_mask_C4_D3 ( short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - short4 data = src_data1 & src_data2; + short4 data = src_data1 OP_BINARY src_data2; data = mask_data ? data : dst_data; *((__global short4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_and_with_mask_C4_D4 ( +__kernel void arithm_bitwise_binary_with_mask_C4_D4 ( __global int *src1, int src1_step, int src1_offset, __global int *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -1103,13 +718,13 @@ __kernel void arithm_bitwise_and_with_mask_C4_D4 ( int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index)); int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - int4 data = src_data1 & src_data2; + int4 data = src_data1 OP_BINARY src_data2; data = mask_data ? data : dst_data; *((__global int4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_and_with_mask_C4_D5 ( +__kernel void arithm_bitwise_binary_with_mask_C4_D5 ( __global char *src1, int src1_step, int src1_offset, __global char *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -1133,14 +748,14 @@ __kernel void arithm_bitwise_and_with_mask_C4_D5 ( char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index)); char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index)); - char16 data = src_data1 & src_data2; + char16 data = src_data1 OP_BINARY src_data2; data = mask_data ? 
data : dst_data; *((__global char16 *)((__global char *)dst + dst_index)) = data; } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_and_with_mask_C4_D6 ( +__kernel void arithm_bitwise_binary_with_mask_C4_D6 ( __global char *src1, int src1_step, int src1_offset, __global char *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -1175,10 +790,10 @@ __kernel void arithm_bitwise_and_with_mask_C4_D6 ( char8 dst_data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16)); char8 dst_data_3 = *((__global char8 *)((__global char *)dst + dst_index + 24)); - char8 data_0 = src_data1_0 & src_data2_0; - char8 data_1 = src_data1_1 & src_data2_1; - char8 data_2 = src_data1_2 & src_data2_2; - char8 data_3 = src_data1_3 & src_data2_3; + char8 data_0 = src_data1_0 OP_BINARY src_data2_0; + char8 data_1 = src_data1_1 OP_BINARY src_data2_1; + char8 data_2 = src_data1_2 OP_BINARY src_data2_2; + char8 data_3 = src_data1_3 OP_BINARY src_data2_3; data_0 = mask_data ? data_0 : dst_data_0; data_1 = mask_data ? data_1 : dst_data_1; diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl similarity index 55% rename from modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl rename to modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl index 4fe1cc31e..5fa25004d 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl @@ -16,6 +16,7 @@ // // @Authors // Jiang Liyuan, jlyuan001.good@163.com +// Peng Xiao, pengxiao@outlook.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -49,11 +50,16 @@ #pragma OPENCL EXTENSION cl_amd_fp64:enable #endif #endif -////////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////BITWISE_XOR//////////////////////////////////////////////////// + +#ifndef OP_BINARY +#define OP_BINARY & +#endif + /////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************xor with scalar without mask**************************************/ -__kernel void arithm_s_bitwise_xor_C1_D0 ( +////////////////////////////////////////////bitwise_binary///////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////////// +/******************************bitwise binary with scalar without mask********************************/ +__kernel void arithm_s_bitwise_binary_C1_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, uchar4 src2, int rows, int cols, int dst_step1) @@ -79,7 +85,7 @@ __kernel void arithm_s_bitwise_xor_C1_D0 ( uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x); uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data ^ src2_data; + uchar4 tmp_data = src1_data OP_BINARY src2_data; data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? 
tmp_data.y : data.y; @@ -91,7 +97,7 @@ __kernel void arithm_s_bitwise_xor_C1_D0 ( } -__kernel void arithm_s_bitwise_xor_C1_D1 ( +__kernel void arithm_s_bitwise_binary_C1_D1 ( __global char *src1, int src1_step, int src1_offset, __global char *dst, int dst_step, int dst_offset, char4 src2, int rows, int cols, int dst_step1) @@ -117,7 +123,7 @@ __kernel void arithm_s_bitwise_xor_C1_D1 ( char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x); char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data ^ src2_data; + char4 tmp_data = src1_data OP_BINARY src2_data; data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; @@ -128,7 +134,7 @@ __kernel void arithm_s_bitwise_xor_C1_D1 ( } } -__kernel void arithm_s_bitwise_xor_C1_D2 ( +__kernel void arithm_s_bitwise_binary_C1_D2 ( __global ushort *src1, int src1_step, int src1_offset, __global ushort *dst, int dst_step, int dst_offset, ushort4 src2, int rows, int cols, int dst_step1) @@ -155,7 +161,7 @@ __kernel void arithm_s_bitwise_xor_C1_D2 ( ushort2 src2_data = (ushort2)(src2.x, src2.x); ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - ushort2 tmp_data = src1_data ^ src2_data; + ushort2 tmp_data = src1_data OP_BINARY src2_data; data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y; @@ -163,7 +169,7 @@ __kernel void arithm_s_bitwise_xor_C1_D2 ( *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C1_D3 ( +__kernel void arithm_s_bitwise_binary_C1_D3 ( __global short *src1, int src1_step, int src1_offset, __global short *dst, int dst_step, int dst_offset, short4 src2, int rows, int cols, int dst_step1) @@ -190,7 +196,7 @@ __kernel void arithm_s_bitwise_xor_C1_D3 ( short2 src2_data = (short2)(src2.x, src2.x); short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - short2 tmp_data = src1_data ^ src2_data; + short2 tmp_data = src1_data OP_BINARY src2_data; data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y; @@ -198,7 +204,7 @@ __kernel void arithm_s_bitwise_xor_C1_D3 ( *((__global short2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C1_D4 ( +__kernel void arithm_s_bitwise_binary_C1_D4 ( __global int *src1, int src1_step, int src1_offset, __global int *dst, int dst_step, int dst_offset, int4 src2, int rows, int cols, int dst_step1) @@ -215,12 +221,12 @@ __kernel void arithm_s_bitwise_xor_C1_D4 ( int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); int src_data2 = src2.x; - int data = src_data1 ^ src_data2; + int data = src_data1 OP_BINARY src_data2; *((__global int *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C1_D5 ( +__kernel void arithm_s_bitwise_binary_C1_D5 ( __global char *src1, int src1_step, int src1_offset, __global char *dst, int dst_step, int dst_offset, char16 src2, int rows, int cols, int dst_step1) @@ -241,7 +247,7 @@ __kernel void arithm_s_bitwise_xor_C1_D5 ( char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); char4 data = *((__global char4 *)((__global char *)dst + dst_index)); - char4 tmp_data = src1_data ^ src2_data; + char4 tmp_data = src1_data OP_BINARY src2_data; data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? 
tmp_data.x : data.x; data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; @@ -251,9 +257,8 @@ __kernel void arithm_s_bitwise_xor_C1_D5 ( *((__global char4 *)((__global char *)dst + dst_index)) = data; } } - #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_C1_D6 ( +__kernel void arithm_s_bitwise_binary_C1_D6 ( __global short *src1, int src1_step, int src1_offset, __global short *dst, int dst_step, int dst_offset, short16 src2, int rows, int cols, int dst_step1) @@ -270,13 +275,13 @@ __kernel void arithm_s_bitwise_xor_C1_D6 ( short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index)); short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 tmp_data = src1_data ^ src2_data; + short4 tmp_data = src1_data OP_BINARY src2_data; *((__global short4 *)((__global char *)dst + dst_index)) = tmp_data; } } #endif -__kernel void arithm_s_bitwise_xor_C2_D0 ( +__kernel void arithm_s_bitwise_binary_C2_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, uchar4 src2, int rows, int cols, int dst_step1) @@ -303,7 +308,7 @@ __kernel void arithm_s_bitwise_xor_C2_D0 ( uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y); uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data ^ src2_data; + uchar4 tmp_data = src1_data OP_BINARY src2_data; data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; @@ -314,7 +319,7 @@ __kernel void arithm_s_bitwise_xor_C2_D0 ( } -__kernel void arithm_s_bitwise_xor_C2_D1 ( +__kernel void arithm_s_bitwise_binary_C2_D1 ( __global char *src1, int src1_step, int src1_offset, __global char *dst, int dst_step, int dst_offset, char4 src2, int rows, int cols, int dst_step1) @@ -341,7 +346,7 @@ __kernel void arithm_s_bitwise_xor_C2_D1 ( char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y); char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data ^ src2_data; + char4 tmp_data = src1_data OP_BINARY src2_data; data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; data.zw = (dst_index + 2 < dst_end ) ? 
tmp_data.zw : data.zw; @@ -350,7 +355,7 @@ __kernel void arithm_s_bitwise_xor_C2_D1 ( } } -__kernel void arithm_s_bitwise_xor_C2_D2 ( +__kernel void arithm_s_bitwise_binary_C2_D2 ( __global ushort *src1, int src1_step, int src1_offset, __global ushort *dst, int dst_step, int dst_offset, ushort4 src2, int rows, int cols, int dst_step1) @@ -367,12 +372,12 @@ __kernel void arithm_s_bitwise_xor_C2_D2 ( ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); ushort2 src_data2 = (ushort2)(src2.x, src2.y); - ushort2 data = src_data1 ^ src_data2; + ushort2 data = src_data1 OP_BINARY src_data2; *((__global ushort2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C2_D3 ( +__kernel void arithm_s_bitwise_binary_C2_D3 ( __global short *src1, int src1_step, int src1_offset, __global short *dst, int dst_step, int dst_offset, short4 src2, int rows, int cols, int dst_step1) @@ -389,12 +394,12 @@ __kernel void arithm_s_bitwise_xor_C2_D3 ( short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); short2 src_data2 = (short2)(src2.x, src2.y); - short2 data = src_data1 ^ src_data2; + short2 data = src_data1 OP_BINARY src_data2; *((__global short2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C2_D4 ( +__kernel void arithm_s_bitwise_binary_C2_D4 ( __global int *src1, int src1_step, int src1_offset, __global int *dst, int dst_step, int dst_offset, int4 src2, int rows, int cols, int dst_step1) @@ -411,11 +416,11 @@ __kernel void arithm_s_bitwise_xor_C2_D4 ( int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); int2 src_data2 = (int2)(src2.x, src2.y); - int2 data = src_data1 ^ src_data2; + int2 data = src_data1 OP_BINARY src_data2; *((__global int2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C2_D5 ( +__kernel void arithm_s_bitwise_binary_C2_D5 ( __global char *src1, int src1_step, int src1_offset, __global char *dst, int dst_step, int dst_offset, char16 src2, int rows, int cols, int dst_step1) @@ -432,13 +437,13 @@ __kernel void arithm_s_bitwise_xor_C2_D5 ( char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index)); char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); - char8 tmp_data = src1_data ^ src2_data; + char8 tmp_data = src1_data OP_BINARY src2_data; *((__global char8 *)((__global char *)dst + dst_index)) = tmp_data; } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_C2_D6 ( +__kernel void arithm_s_bitwise_binary_C2_D6 ( __global short *src1, int src1_step, int src1_offset, __global short *dst, int dst_step, int dst_offset, short16 src2, int rows, int cols, int dst_step1) @@ -455,347 +460,14 @@ __kernel void arithm_s_bitwise_xor_C2_D6 ( short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index)); short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); - short8 tmp_data = src1_data ^ src2_data; + short8 tmp_data = src1_data OP_BINARY src2_data; *((__global short8 *)((__global char *)dst + dst_index)) = tmp_data; } } #endif -__kernel void arithm_s_bitwise_xor_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define 
dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0; - uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1; - uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_xor_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_xor_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0; - ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1; - ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 ^ src2_data_0; - short2 tmp_data_1 = src1_data_1 ^ src2_data_1; - short2 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 ^ src2_data_0; - int tmp_data_1 = src1_data_1 ^ src2_data_1; - int tmp_data_2 = src1_data_2 ^ src2_data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -__kernel void arithm_s_bitwise_xor_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 
src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 ^ src2_data_0; - short4 tmp_data_1 = src1_data_1 ^ src2_data_1; - short4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - } -} -#endif -__kernel void arithm_s_bitwise_xor_C4_D0 ( +__kernel void arithm_s_bitwise_binary_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, uchar4 src2, int rows, int cols, int dst_step1) @@ -811,14 +483,14 @@ __kernel void arithm_s_bitwise_xor_C4_D0 ( uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 data = src_data1 ^ src2; + uchar4 data = src_data1 OP_BINARY src2; *((__global uchar4 *)(dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C4_D1 ( +__kernel void arithm_s_bitwise_binary_C4_D1 ( __global char *src1, int src1_step, int src1_offset, __global char *dst, int dst_step, int dst_offset, char4 src2, int rows, int cols, int dst_step1) @@ -834,13 +506,13 @@ __kernel void arithm_s_bitwise_xor_C4_D1 ( char4 src_data1 = *((__global char4 *)(src1 + src1_index)); - char4 data = src_data1 ^ src2; + char4 data = src_data1 OP_BINARY src2; *((__global char4 *)(dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C4_D2 ( +__kernel void arithm_s_bitwise_binary_C4_D2 ( __global ushort *src1, int src1_step, int src1_offset, __global ushort *dst, int dst_step, int dst_offset, ushort4 src2, int rows, int cols, int dst_step1) @@ -856,12 +528,12 @@ __kernel void arithm_s_bitwise_xor_C4_D2 ( ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 data = src_data1 ^ src2; + ushort4 data = src_data1 OP_BINARY src2; *((__global ushort4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C4_D3 ( +__kernel void arithm_s_bitwise_binary_C4_D3 ( __global short *src1, int src1_step, int src1_offset, __global short *dst, int dst_step, int dst_offset, short4 src2, int rows, int cols, int dst_step1) @@ -877,12 +549,12 @@ __kernel void arithm_s_bitwise_xor_C4_D3 ( short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 data = src_data1 ^ src2; + short4 data = src_data1 OP_BINARY src2; *((__global short4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C4_D4 ( +__kernel void arithm_s_bitwise_binary_C4_D4 ( __global int *src1, int src1_step, int src1_offset, __global int *dst, int dst_step, int dst_offset, int4 src2, int rows, int cols, int dst_step1) @@ -898,12 +570,12 @@ __kernel void arithm_s_bitwise_xor_C4_D4 ( int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 
data = src_data1 ^ src2; + int4 data = src_data1 OP_BINARY src2; *((__global int4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C4_D5 ( +__kernel void arithm_s_bitwise_binary_C4_D5 ( __global char *src1, int src1_step, int src1_offset, __global char *dst, int dst_step, int dst_offset, char16 src2, int rows, int cols, int dst_step1) @@ -921,13 +593,13 @@ __kernel void arithm_s_bitwise_xor_C4_D5 ( char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7, src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf); - char16 tmp_data = src1_data ^ src2_data; + char16 tmp_data = src1_data OP_BINARY src2_data; *((__global char16 *)((__global char *)dst + dst_index)) = tmp_data; } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_C4_D6 ( +__kernel void arithm_s_bitwise_binary_C4_D6 ( __global short *src1, int src1_step, int src1_offset, __global short *dst, int dst_step, int dst_offset, short16 src2, int rows, int cols, int dst_step1) @@ -951,10 +623,10 @@ __kernel void arithm_s_bitwise_xor_C4_D6 ( short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf); - short4 tmp_data_0 = src1_data_0 ^ src2_data_0; - short4 tmp_data_1 = src1_data_1 ^ src2_data_1; - short4 tmp_data_2 = src1_data_2 ^ src2_data_2; - short4 tmp_data_3 = src1_data_3 ^ src2_data_3; + short4 tmp_data_0 = src1_data_0 OP_BINARY src2_data_0; + short4 tmp_data_1 = src1_data_1 OP_BINARY src2_data_1; + short4 tmp_data_2 = src1_data_2 OP_BINARY src2_data_2; + short4 tmp_data_3 = src1_data_3 OP_BINARY src2_data_3; *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; @@ -963,4 +635,4 @@ __kernel void arithm_s_bitwise_xor_C4_D6 ( } } -#endif \ No newline at end of file +#endif diff --git a/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl similarity index 58% rename from modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl rename to modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl index b739ea1e7..9af6589ad 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl @@ -49,11 +49,16 @@ #pragma OPENCL EXTENSION cl_amd_fp64:enable #endif #endif + +#ifndef OP_BINARY +#define OP_BINARY & +#endif + ////////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////BITWISE_AND//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************bitwise_and with scalar with mask**************************************/ -__kernel void arithm_s_bitwise_and_with_mask_C1_D0 ( +////////////////////////////////////////////bitwise_binary//////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////////// +/**************************************bitwise_binary with scalar with mask**************************************/ +__kernel void arithm_s_bitwise_binary_with_mask_C1_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -83,7 
+88,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 ( uchar4 mask_data = vload4(0, mask + mask_index); uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data & src2_data; + uchar4 tmp_data = src1_data OP_BINARY src2_data; data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; @@ -95,7 +100,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 ( } -__kernel void arithm_s_bitwise_and_with_mask_C1_D1 ( +__kernel void arithm_s_bitwise_binary_with_mask_C1_D1 ( __global char *src1, int src1_step, int src1_offset, __global char *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -125,7 +130,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 ( uchar4 mask_data = vload4(0, mask + mask_index); char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data & src2_data; + char4 tmp_data = src1_data OP_BINARY src2_data; data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; @@ -136,7 +141,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 ( } } -__kernel void arithm_s_bitwise_and_with_mask_C1_D2 ( +__kernel void arithm_s_bitwise_binary_with_mask_C1_D2 ( __global ushort *src1, int src1_step, int src1_offset, __global ushort *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -153,7 +158,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 ( #ifdef dst_align #undef dst_align #endif -#define dst_align ((dst_offset >> 1) & 1) +#define dst_align ((dst_offset / 2) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -166,7 +171,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 ( uchar2 mask_data = vload2(0, mask + mask_index); ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - ushort2 tmp_data = src1_data & src2_data; + ushort2 tmp_data = src1_data OP_BINARY src2_data; data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; @@ -174,7 +179,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 ( *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C1_D3 ( +__kernel void arithm_s_bitwise_binary_with_mask_C1_D3 ( __global short *src1, int src1_step, int src1_offset, __global short *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -191,7 +196,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 ( #ifdef dst_align #undef dst_align #endif -#define dst_align ((dst_offset >> 1) & 1) +#define dst_align ((dst_offset / 2) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -204,7 +209,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 ( uchar2 mask_data = vload2(0, mask + mask_index); short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - short2 tmp_data = src1_data & src2_data; + short2 tmp_data = src1_data OP_BINARY src2_data; data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y; @@ -212,7 +217,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 ( *((__global short2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C1_D4 ( +__kernel void arithm_s_bitwise_binary_with_mask_C1_D4 ( __global int *src1, int src1_step, int src1_offset, __global int *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -234,14 +239,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D4 ( int src_data2 = src2.x; int dst_data = *((__global int *)((__global char *)dst + dst_index)); - int data = src_data1 & src_data2; + int data = src_data1 OP_BINARY src_data2; data = mask_data ? data : dst_data; *((__global int *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C1_D5 ( +__kernel void arithm_s_bitwise_binary_with_mask_C1_D5 ( __global char *src1, int src1_step, int src1_offset, __global char *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -263,7 +268,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D5 ( char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index)); - char4 data = src1_data & src2_data; + char4 data = src1_data OP_BINARY src2_data; data = mask_data ? data : dst_data; *((__global char4 *)((__global char *)dst + dst_index)) = data; @@ -271,7 +276,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D5 ( } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_with_mask_C1_D6 ( +__kernel void arithm_s_bitwise_binary_with_mask_C1_D6 ( __global short *src1, int src1_step, int src1_offset, __global short *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -293,14 +298,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D6 ( short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - short4 data = src1_data & src2_data; + short4 data = src1_data OP_BINARY src2_data; data = mask_data ? 
data : dst_data; *((__global short4 *)((__global char *)dst + dst_index)) = data; } } #endif -__kernel void arithm_s_bitwise_and_with_mask_C2_D0 ( +__kernel void arithm_s_bitwise_binary_with_mask_C2_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -317,7 +322,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 ( #ifdef dst_align #undef dst_align #endif -#define dst_align ((dst_offset >> 1) & 1) +#define dst_align ((dst_offset / 2) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -330,7 +335,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 ( uchar2 mask_data = vload2(0, mask + mask_index); uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data & src2_data; + uchar4 tmp_data = src1_data OP_BINARY src2_data; data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw; @@ -340,7 +345,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 ( } -__kernel void arithm_s_bitwise_and_with_mask_C2_D1 ( +__kernel void arithm_s_bitwise_binary_with_mask_C2_D1 ( __global char *src1, int src1_step, int src1_offset, __global char *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -357,7 +362,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 ( #ifdef dst_align #undef dst_align #endif -#define dst_align ((dst_offset >> 1) & 1) +#define dst_align ((dst_offset / 2) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -370,7 +375,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 ( uchar2 mask_data = vload2(0, mask + mask_index); char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data & src2_data; + char4 tmp_data = src1_data OP_BINARY src2_data; data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw; @@ -379,7 +384,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 ( } } -__kernel void arithm_s_bitwise_and_with_mask_C2_D2 ( +__kernel void arithm_s_bitwise_binary_with_mask_C2_D2 ( __global ushort *src1, int src1_step, int src1_offset, __global ushort *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -401,13 +406,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D2 ( ushort2 src_data2 = (ushort2)(src2.x, src2.y); ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - ushort2 data = src_data1 & src_data2; + ushort2 data = src_data1 OP_BINARY src_data2; data = mask_data ? 
data : dst_data; *((__global ushort2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C2_D3 ( +__kernel void arithm_s_bitwise_binary_with_mask_C2_D3 ( __global short *src1, int src1_step, int src1_offset, __global short *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -429,13 +434,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D3 ( short2 src_data2 = (short2)(src2.x, src2.y); short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - short2 data = src_data1 & src_data2; + short2 data = src_data1 OP_BINARY src_data2; data = mask_data ? data : dst_data; *((__global short2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C2_D4 ( +__kernel void arithm_s_bitwise_binary_with_mask_C2_D4 ( __global int *src1, int src1_step, int src1_offset, __global int *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -457,13 +462,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D4 ( int2 src_data2 = (int2)(src2.x, src2.y); int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - int2 data = src_data1 & src_data2; + int2 data = src_data1 OP_BINARY src_data2; data = mask_data ? data : dst_data; *((__global int2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C2_D5 ( +__kernel void arithm_s_bitwise_binary_with_mask_C2_D5 ( __global char *src1, int src1_step, int src1_offset, __global char *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -485,7 +490,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D5 ( char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index)); - char8 data = src1_data & src2_data; + char8 data = src1_data OP_BINARY src2_data; data = mask_data ? data : dst_data; @@ -493,7 +498,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D5 ( } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_with_mask_C2_D6 ( +__kernel void arithm_s_bitwise_binary_with_mask_C2_D6 ( __global short *src1, int src1_step, int src1_offset, __global short *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -515,388 +520,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D6 ( short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index)); - short8 data = src1_data & src2_data; + short8 data = src1_data OP_BINARY src2_data; data = mask_data ? 
data : dst_data; *((__global short8 *)((__global char *)dst + dst_index)) = data; } } #endif -__kernel void arithm_s_bitwise_and_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 & src2_data_0; - uchar4 tmp_data_1 = src1_data_1 & src2_data_1; - uchar4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_and_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 & src2_data_0; - char4 tmp_data_1 = src1_data_1 & src2_data_1; - char4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_and_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 & src2_data_0; - ushort2 tmp_data_1 = src1_data_1 & src2_data_1; - ushort2 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_and_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 & src2_data_0; - short2 tmp_data_1 = src1_data_1 & src2_data_1; - short2 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_and_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 & src2_data_0; - int tmp_data_1 = src1_data_1 & src2_data_1; - int tmp_data_2 = src1_data_2 & src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_and_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - uchar mask_data = * (mask + mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 & src2_data_0; - char4 tmp_data_1 = src1_data_1 & src2_data_1; - char4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_with_mask_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - uchar mask_data = * (mask + mask_index); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 & src2_data_0; - short4 tmp_data_1 = src1_data_1 & src2_data_1; - short4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif -__kernel void arithm_s_bitwise_and_with_mask_C4_D0 ( +__kernel void arithm_s_bitwise_binary_with_mask_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -917,7 +548,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D0 ( uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 data = src_data1 & src2; + uchar4 data = src_data1 OP_BINARY src2; data = mask_data ? data : dst_data; *((__global uchar4 *)(dst + dst_index)) = data; @@ -925,7 +556,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D0 ( } -__kernel void arithm_s_bitwise_and_with_mask_C4_D1 ( +__kernel void arithm_s_bitwise_binary_with_mask_C4_D1 ( __global char *src1, int src1_step, int src1_offset, __global char *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -946,14 +577,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D1 ( char4 src_data1 = *((__global char4 *)(src1 + src1_index)); char4 dst_data = *((__global char4 *)(dst + dst_index)); - char4 data = src_data1 & src2; + char4 data = src_data1 OP_BINARY src2; data = mask_data ? 
data : dst_data; *((__global char4 *)(dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C4_D2 ( +__kernel void arithm_s_bitwise_binary_with_mask_C4_D2 ( __global ushort *src1, int src1_step, int src1_offset, __global ushort *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -974,13 +605,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D2 ( ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - ushort4 data = src_data1 & src2; + ushort4 data = src_data1 OP_BINARY src2; data = mask_data ? data : dst_data; *((__global ushort4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C4_D3 ( +__kernel void arithm_s_bitwise_binary_with_mask_C4_D3 ( __global short *src1, int src1_step, int src1_offset, __global short *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -1001,13 +632,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D3 ( short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - short4 data = src_data1 & src2; + short4 data = src_data1 OP_BINARY src2; data = mask_data ? data : dst_data; *((__global short4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C4_D4 ( +__kernel void arithm_s_bitwise_binary_with_mask_C4_D4 ( __global int *src1, int src1_step, int src1_offset, __global int *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -1028,13 +659,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D4 ( int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - int4 data = src_data1 & src2; + int4 data = src_data1 OP_BINARY src2; data = mask_data ? data : dst_data; *((__global int4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C4_D5 ( +__kernel void arithm_s_bitwise_binary_with_mask_C4_D5 ( __global char *src1, int src1_step, int src1_offset, __global char *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -1057,14 +688,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D5 ( src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf); char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index)); - char16 data = src1_data & src2_data; + char16 data = src1_data OP_BINARY src2_data; data = mask_data ? 
data : dst_data; *((__global char16 *)((__global char *)dst + dst_index)) = data; } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_with_mask_C4_D6 ( +__kernel void arithm_s_bitwise_binary_with_mask_C4_D6 ( __global short *src1, int src1_step, int src1_offset, __global short *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, @@ -1097,10 +728,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 ( short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); short4 dst_data_3 = *((__global short4 *)((__global char *)dst + dst_index + 24)); - short4 data_0 = src1_data_0 & src2_data_0; - short4 data_1 = src1_data_1 & src2_data_1; - short4 data_2 = src1_data_2 & src2_data_2; - short4 data_3 = src1_data_3 & src2_data_3; + short4 data_0 = src1_data_0 OP_BINARY src2_data_0; + short4 data_1 = src1_data_1 OP_BINARY src2_data_1; + short4 data_2 = src1_data_2 OP_BINARY src2_data_2; + short4 data_3 = src1_data_3 OP_BINARY src2_data_3; data_0 = mask_data ? data_0 : dst_data_0; data_1 = mask_data ? data_1 : dst_data_1; diff --git a/modules/ocl/src/opencl/arithm_bitwise_or.cl b/modules/ocl/src/opencl/arithm_bitwise_or.cl deleted file mode 100644 index dd7c53c7f..000000000 --- a/modules/ocl/src/opencl/arithm_bitwise_or.cl +++ /dev/null @@ -1,294 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jiang Liyuan, jlyuan001.good@163.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. 
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif - -////////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////BITWISE_OR//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************bitwise_or without mask**************************************/ -__kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data | src2_data; - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - - -__kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = vload4(0, src2 + src2_index); - - char4 dst_data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data | src2_data; - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global char4 *)(dst + dst_index)) = dst_data; - } -} - - -__kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - ushort4 tmp_data = src1_data | src2_data; - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - short4 tmp_data = src1_data | src2_data; - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global short4 *)((__global char *)dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int data1 = *((__global int *)((__global char *)src1 + src1_index)); - int data2 = *((__global int *)((__global char *)src2 + src2_index)); - int tmp = data1 | data2; - - *((__global int *)((__global char *)dst + dst_index)) = tmp; - } -} - -__kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - char4 data1 = *((__global char4 *)((__global char *)src1 + src1_index)); - char4 data2 = *((__global char4 *)((__global char *)src2 + src2_index)); - char4 tmp = data1 | data2; - - *((__global char4 *)((__global char *)dst + dst_index)) = tmp; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); 
- - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - char8 data1 = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 data2 = *((__global char8 *)((__global char *)src2 + src2_index)); - - *((__global char8 *)((__global char *)dst + dst_index)) = data1 | data2; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl deleted file mode 100644 index 0242c8673..000000000 --- a/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl +++ /dev/null @@ -1,1194 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jiang Liyuan, jlyuan001.good@163.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif -////////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////BITWISE_OR//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************bitwise_or with mask**************************************/ -__kernel void arithm_bitwise_or_with_mask_C1_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data | src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_or_with_mask_C1_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = vload4(0, src2 + src2_index); - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data | src2_data; - - data.x = convert_char((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? 
tmp_data.x : data.x; - data.y = convert_char((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = convert_char((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = convert_char((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_or_with_mask_C1_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index)); - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - ushort2 tmp_data = src1_data | src2_data; - - data.x = convert_ushort((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = convert_ushort((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_or_with_mask_C1_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index)); - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - short2 tmp_data = src1_data | src2_data; - - data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? 
tmp_data.x : data.x; - data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_or_with_mask_C1_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = *((__global int *)((__global char *)src2 + src2_index)); - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - int data = src_data1 | src_data2; - data = mask_data ? data : dst_data; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_or_with_mask_C1_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char4 src_data1 = *((__global char4 *)((__global char *)src1 + src1_index)); - char4 src_data2 = *((__global char4 *)((__global char *)src2 + src2_index)); - char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index)); - - char4 data = src_data1 | src_data2; - data = mask_data ? data : dst_data; - - *((__global char4 *)((__global char *)dst + dst_index)) = data; - } -} - - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_or_with_mask_C1_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index)); - char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index)); - - char8 data = src_data1 | src_data2; - data = mask_data ? 
data : dst_data; - - *((__global char8 *)((__global char *)dst + dst_index)) = data; - } - -} -#endif - - -__kernel void arithm_bitwise_or_with_mask_C2_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - uchar2 mask_data = vload2(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data | src2_data; - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_bitwise_or_with_mask_C2_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = vload4(0, src2 + src2_index); - uchar2 mask_data = vload2(0, mask + mask_index); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data | src2_data; - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.zw : data.zw; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_bitwise_or_with_mask_C2_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index)); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - ushort2 data = src_data1 | src_data2; - data = mask_data ? data : dst_data; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_or_with_mask_C2_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index)); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - short2 data = src_data1 | src_data2; - data = mask_data ? data : dst_data; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_or_with_mask_C2_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index)); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = src_data1 | src_data2; - data = mask_data ? 
data : dst_data; - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_or_with_mask_C2_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index)); - char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index)); - - char8 data = src_data1 | src_data2; - data = mask_data ? data : dst_data; - - *((__global char8 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_or_with_mask_C2_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index)); - char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index)); - char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index)); - - char16 data = src_data1 | src_data2; - data = mask_data ? 
data : dst_data; - - *((__global char16 *)((__global char *)dst + dst_index)) = data; - } -} -#endif - - -__kernel void arithm_bitwise_or_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 | src2_data_0; - uchar4 tmp_data_1 = src1_data_1 | src2_data_1; - uchar4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_bitwise_or_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = vload4(0, src2 + src2_index + 0); - char4 src2_data_1 = vload4(0, src2 + src2_index + 4); - char4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_bitwise_or_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 | src2_data_0 ; - ushort2 tmp_data_1 = src1_data_1 | src2_data_1 ; - ushort2 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_or_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 | src2_data_0 ; - short2 tmp_data_1 = src1_data_1 | src2_data_1 ; - short2 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_or_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0)); - int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 | src2_data_0 ; - int tmp_data_1 = src1_data_1 | src2_data_1 ; - int tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_or_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0)); - char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4)); - char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_or_with_mask_C3_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 )); - char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 )); - char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16)); - - char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 )); - char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 )); - char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - char8 data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0 )); - char8 data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8 )); - char8 data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16)); - - char8 tmp_data_0 = src1_data_0 | src2_data_0; - char8 tmp_data_1 = src1_data_1 | src2_data_1; - char8 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif - - -__kernel void arithm_bitwise_or_with_mask_C4_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - uchar4 data = src_data1 | src_data2; - data = mask_data ? 
data : dst_data; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_bitwise_or_with_mask_C4_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char4 src_data1 = *((__global char4 *)(src1 + src1_index)); - char4 src_data2 = *((__global char4 *)(src2 + src2_index)); - char4 dst_data = *((__global char4 *)(dst + dst_index)); - - char4 data = src_data1 | src_data2; - data = mask_data ? data : dst_data; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_bitwise_or_with_mask_C4_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - ushort4 data = src_data1 | src_data2; - data = mask_data ? data : dst_data; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_or_with_mask_C4_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = src_data1 | src_data2; - data = mask_data ? 
data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_or_with_mask_C4_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index)); - int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - - int4 data = src_data1 | src_data2; - data = mask_data ? data : dst_data; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_or_with_mask_C4_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index)); - char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index)); - char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index)); - - char16 data = src_data1 | src_data2; - data = mask_data ? 
data : dst_data; - - *((__global char16 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_or_with_mask_C4_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 5) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char8 src_data1_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0)); - char8 src_data1_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8)); - char8 src_data1_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16)); - char8 src_data1_3 = *((__global char8 *)((__global char *)src1 + src1_index + 24)); - - char8 src_data2_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0)); - char8 src_data2_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8)); - char8 src_data2_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16)); - char8 src_data2_3 = *((__global char8 *)((__global char *)src2 + src2_index + 24)); - - char8 dst_data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0)); - char8 dst_data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8)); - char8 dst_data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16)); - char8 dst_data_3 = *((__global char8 *)((__global char *)dst + dst_index + 24)); - - char8 data_0 = src_data1_0 | src_data2_0; - char8 data_1 = src_data1_1 | src_data2_1; - char8 data_2 = src_data1_2 | src_data2_2; - char8 data_3 = src_data1_3 | src_data2_3; - - data_0 = mask_data ? data_0 : dst_data_0; - data_1 = mask_data ? data_1 : dst_data_1; - data_2 = mask_data ? data_2 : dst_data_2; - data_3 = mask_data ? data_3 : dst_data_3; - - *((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16)) = data_2; - *((__global char8 *)((__global char *)dst + dst_index + 24)) = data_3; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl deleted file mode 100644 index 2730f9dad..000000000 --- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl +++ /dev/null @@ -1,973 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. 
-// -// @Authors -// Jiang Liyuan, jlyuan001.good@163.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif - -////////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////BITWISE_OR//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************and with scalar without mask**************************************/ -__kernel void arithm_s_bitwise_or_C1_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data | src2_data; - - data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? 
tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_or_C1_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data | src2_data; - - data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_or_C1_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort2 src2_data = (ushort2)(src2.x, src2.x); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - ushort2 tmp_data = src1_data | src2_data; - - data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_C1_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - short2 src2_data = (short2)(src2.x, src2.x); - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - - short2 tmp_data = src1_data | src2_data; - - data.x = (dst_index + 0 >= dst_start) ? 
tmp_data.x : data.x; - data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_C1_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - - int data = src_data1 | src_data2; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_C1_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - char4 src_data1 = *((__global char4 *)((__global char *)src1 + src1_index)); - char4 src_data2 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - - char4 data = src_data1 | src_data2; - - *((__global char4 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_C1_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - - short4 tmp_data = src1_data | src2_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = tmp_data; - } -} -#endif -__kernel void arithm_s_bitwise_or_C2_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data | src2_data; - - data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; - data.zw = (dst_index + 2 < dst_end ) ? 
tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_or_C2_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data | src2_data; - - data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; - data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_or_C2_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - ushort2 src_data2 = (ushort2)(src2.x, src2.y); - - ushort2 data = src_data1 | src_data2; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_C2_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - short2 src_data2 = (short2)(src2.x, src2.y); - - short2 data = src_data1 | src_data2; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - - int2 data = src_data1 | src_data2; - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_C2_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, 
dst_step, (x << 3) + dst_offset); - - char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 src_data2 = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); - - char8 data = src_data1 | src_data2; - *((__global char8 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_C2_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index)); - short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); - - short8 tmp_data = src1_data & src2_data; - - *((__global short8 *)((__global char *)dst + dst_index)) = tmp_data; - } -} -#endif -__kernel void arithm_s_bitwise_or_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 | src2_data_0 ; - uchar4 tmp_data_1 = src1_data_1 | src2_data_1 ; - uchar4 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_or_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_or_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 | src2_data_0 ; - ushort2 tmp_data_1 = src1_data_1 | src2_data_1 ; - ushort2 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 | src2_data_0 ; - short2 tmp_data_1 = src1_data_1 | src2_data_1 ; - short2 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 | src2_data_0; - int tmp_data_1 = src1_data_1 | src2_data_1; - int tmp_data_2 = src1_data_2 | src2_data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -__kernel void arithm_s_bitwise_or_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char 
*)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 | src2_data_0; - short4 tmp_data_1 = src1_data_1 | src2_data_1; - short4 tmp_data_2 = src1_data_2 | src2_data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - } -} -#endif -__kernel void arithm_s_bitwise_or_C4_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - - uchar4 data = src_data1 | src2; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_or_C4_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - char4 src_data1 = *((__global char4 *)(src1 + src1_index)); - - char4 data = src_data1 | src2; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_or_C4_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - - ushort4 data = src_data1 | src2; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_C4_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - - short4 data = src_data1 | src2; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_C4_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - 
- if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - - int4 data = src_data1 | src2; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_C4_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index)); - char16 src_data2 = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7, - src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf); - - char16 data = src_data1 | src_data2; - - *((__global char16 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_C4_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0)); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8)); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf); - - short4 tmp_data_0 = src1_data_0 | src2_data_0; - short4 tmp_data_1 = src1_data_1 | src2_data_1; - short4 tmp_data_2 = src1_data_2 | src2_data_2; - short4 tmp_data_3 = src1_data_3 | src2_data_3; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3; - - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl deleted file mode 100644 index 9184ff706..000000000 --- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl +++ /dev/null @@ -1,1140 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. 
-// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jiang Liyuan, jlyuan001.good@163.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif -////////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////BITWISE_OR//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************bitwise_or with scalar with mask**************************************/ -__kernel void arithm_s_bitwise_or_with_mask_C1_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x); - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data | src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_or_with_mask_C1_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x); - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data | src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? 
tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_or_with_mask_C1_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort2 src2_data = (ushort2)(src2.x, src2.x); - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - ushort2 tmp_data = src1_data | src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C1_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - short2 src2_data = (short2)(src2.x, src2.x); - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - short2 tmp_data = src1_data | src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C1_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - int data = src_data1 | src_data2; - data = mask_data ? data : dst_data; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_or_with_mask_C1_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char4 src_data1 = *((__global char4 *)((__global char *)src1 + src1_index)); - char4 src_data2 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index)); - - char4 data = src_data1 | src_data2; - data = mask_data ? data : dst_data; - - *((__global char4 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_with_mask_C1_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = src1_data | src2_data; - data = mask_data ? 
data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -#endif -__kernel void arithm_s_bitwise_or_with_mask_C2_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y); - uchar2 mask_data = vload2(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data | src2_data; - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_or_with_mask_C2_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y); - uchar2 mask_data = vload2(0, mask + mask_index); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data | src2_data; - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.zw : data.zw; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_or_with_mask_C2_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - ushort2 src_data2 = (ushort2)(src2.x, src2.y); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - ushort2 data = src_data1 | src_data2; - data = mask_data ? data : dst_data; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C2_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - short2 src_data2 = (short2)(src2.x, src2.y); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - short2 data = src_data1 | src_data2; - data = mask_data ? data : dst_data; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C2_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = src_data1 | src_data2; - data = mask_data ? 
data : dst_data; - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C2_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 src_data2 = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); - char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index)); - char8 data = src_data1 | src_data2; - data = mask_data ? data : dst_data; - *((__global char8 *)((__global char *)dst + dst_index)) = data; - - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_with_mask_C2_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index)); - short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); - short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index)); - - short8 data = src1_data | src2_data; - data = mask_data ? 
data : dst_data; - - *((__global short8 *)((__global char *)dst + dst_index)) = data; - } -} -#endif -__kernel void arithm_s_bitwise_or_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 | src2_data_0; - uchar4 tmp_data_1 = src1_data_1 | src2_data_1; - uchar4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_or_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_or_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 | src2_data_0; - ushort2 tmp_data_1 = src1_data_1 | src2_data_1; - ushort2 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 | src2_data_0; - short2 tmp_data_1 = src1_data_1 | src2_data_1; - short2 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 | src2_data_0; - int tmp_data_1 = src1_data_1 | src2_data_1; - int tmp_data_2 = src1_data_2 | src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - uchar mask_data = * (mask + mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_with_mask_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - uchar mask_data = * (mask + mask_index); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 | src2_data_0; - short4 tmp_data_1 = src1_data_1 | src2_data_1; - short4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif -__kernel void arithm_s_bitwise_or_with_mask_C4_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - uchar4 data = src_data1 | src2; - data = mask_data ? 
data : dst_data; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_or_with_mask_C4_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char4 src_data1 = *((__global char4 *)(src1 + src1_index)); - char4 dst_data = *((__global char4 *)(dst + dst_index)); - - char4 data = src_data1 | src2; - data = mask_data ? data : dst_data; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_or_with_mask_C4_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - ushort4 data = src_data1 | src2; - data = mask_data ? data : dst_data; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C4_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = src_data1 | src2; - data = mask_data ? data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C4_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - - int4 data = src_data1 | src2; - data = mask_data ? 
data : dst_data; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C4_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index)); - char16 src_data2 = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7, - src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf); - char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index)); - - char16 data = src_data1 | src_data2; - data = mask_data ? data : dst_data; - - *((__global char16 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_with_mask_C4_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0)); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8)); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf); - - short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0)); - short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8)); - short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - short4 dst_data_3 = *((__global short4 *)((__global char *)dst + dst_index + 24)); - - short4 data_0 = src1_data_0 | src2_data_0; - short4 data_1 = src1_data_1 | src2_data_1; - short4 data_2 = src1_data_2 | src2_data_2; - short4 data_3 = src1_data_3 | src2_data_3; - - data_0 = mask_data ? data_0 : dst_data_0; - data_1 = mask_data ? data_1 : dst_data_1; - data_2 = mask_data ? data_2 : dst_data_2; - data_3 = mask_data ? 
data_3 : dst_data_3; - - *((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8)) = data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16)) = data_2; - *((__global short4 *)((__global char *)dst + dst_index + 24)) = data_3; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor.cl b/modules/ocl/src/opencl/arithm_bitwise_xor.cl deleted file mode 100644 index 4b34af152..000000000 --- a/modules/ocl/src/opencl/arithm_bitwise_xor.cl +++ /dev/null @@ -1,340 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jiang Liyuan, jlyuan001.good@163.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif -////////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////BITWISE_XOR//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************bitwise_xor without mask**************************************/ -__kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data ^ src2_data; - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global uchar4 *)(dst + dst_index)) = dst_data; - } -} - - -__kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 
0 : src2_index; - char4 src1_data = vload4(0, src1 + src1_index_fix); - char4 src2_data = vload4(0, src2 + src2_index_fix); - - if(src1_index < 0) - { - char4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - char4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - char4 dst_data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data ^ src2_data; - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; - - *((__global char4 *)(dst + dst_index)) = dst_data; - } -} - - -__kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); - - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - ushort4 tmp_data = src1_data ^ src2_data; - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); - - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - - short4 tmp_data = src1_data ^ src2_data; - - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? 
tmp_data.w : dst_data.w; - - *((__global short4 *)((__global char *)dst + dst_index)) = dst_data; - } -} - - - -__kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - int data1 = *((__global int *)((__global char *)src1 + src1_index)); - int data2 = *((__global int *)((__global char *)src2 + src2_index)); - int tmp = data1 ^ data2; - - *((__global int *)((__global char *)dst + dst_index)) = tmp; - } -} - -__kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - char4 data1 = *((__global char4 *)((__global char *)src1 + src1_index)); - char4 data2 = *((__global char4 *)((__global char *)src2 + src2_index)); - char4 tmp = data1 ^ data2; - - *((__global char4 *)((__global char *)dst + dst_index)) = tmp; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - char8 data1 = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 data2 = *((__global char8 *)((__global char *)src2 + src2_index)); - - *((__global char8 *)((__global char *)dst + dst_index)) = data1 ^ data2; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl deleted file mode 100644 index 25ed0113a..000000000 --- a/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl +++ /dev/null @@ -1,1194 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. 
-// -// @Authors -// Jiang Liyuan, jlyuan001.good@163.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif -////////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////BITWISE_XOR//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************bitwise_xor with mask**************************************/ -__kernel void arithm_bitwise_xor_with_mask_C1_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data ^ src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? 
tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_xor_with_mask_C1_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = vload4(0, src2 + src2_index); - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data ^ src2_data; - - data.x = convert_char((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = convert_char((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = convert_char((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = convert_char((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_xor_with_mask_C1_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index)); - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - ushort2 tmp_data = src1_data ^ src2_data; - - data.x = convert_ushort((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = convert_ushort((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? 
tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_xor_with_mask_C1_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index)); - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - short2 tmp_data = src1_data ^ src2_data; - - data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_xor_with_mask_C1_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = *((__global int *)((__global char *)src2 + src2_index)); - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - int data = src_data1 ^ src_data2; - data = mask_data ? 
data : dst_data; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} - - - -__kernel void arithm_bitwise_xor_with_mask_C1_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char4 src_data1 = *((__global char4 *)((__global char *)src1 + src1_index)); - char4 src_data2 = *((__global char4 *)((__global char *)src2 + src2_index)); - char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index)); - - char4 data = src_data1 ^ src_data2; - data = mask_data ? data : dst_data; - - *((__global char4 *)((__global char *)dst + dst_index)) = data; - } -} - - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_xor_with_mask_C1_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index)); - char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index)); - - char8 data = src_data1 ^ src_data2; - data = mask_data ? 
data : dst_data; - - *((__global char8 *)((__global char *)dst + dst_index)) = data; - } -} -#endif - - - -__kernel void arithm_bitwise_xor_with_mask_C2_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - uchar2 mask_data = vload2(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data ^ src2_data; - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_bitwise_xor_with_mask_C2_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = vload4(0, src2 + src2_index); - uchar2 mask_data = vload2(0, mask + mask_index); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data ^ src2_data; - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.zw : data.zw; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_bitwise_xor_with_mask_C2_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index)); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - ushort2 data = src_data1 ^ src_data2; - data = mask_data ? data : dst_data; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_xor_with_mask_C2_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index)); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - short2 data = src_data1 ^ src_data2; - data = mask_data ? data : dst_data; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_xor_with_mask_C2_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index)); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = src_data1 ^ src_data2; - data = mask_data ? 
data : dst_data; - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_xor_with_mask_C2_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index)); - char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index)); - - char8 data = src_data1 ^ src_data2; - data = mask_data ? data : dst_data; - - *((__global char8 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_xor_with_mask_C2_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index)); - char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index)); - char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index)); - - char16 data = src_data1 ^ src_data2; - data = mask_data ? 
data : dst_data; - - *((__global char16 *)((__global char *)dst + dst_index)) = data; - } -} -#endif - - -__kernel void arithm_bitwise_xor_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0; - uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1; - uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_bitwise_xor_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = vload4(0, src2 + src2_index + 0); - char4 src2_data_1 = vload4(0, src2 + src2_index + 4); - char4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_bitwise_xor_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0 ; - ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1 ; - ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_xor_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 ^ src2_data_0 ; - short2 tmp_data_1 = src1_data_1 ^ src2_data_1 ; - short2 tmp_data_2 = src1_data_2 ^ src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_xor_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0)); - int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 ^ src2_data_0 ; - int tmp_data_1 = src1_data_1 ^ src2_data_1 ; - int tmp_data_2 = src1_data_2 ^ src2_data_2 ; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_xor_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0)); - char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4)); - char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_xor_with_mask_C3_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 )); - char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 )); - char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16)); - - char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 )); - char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 )); - char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - char8 data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0 )); - char8 data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8 )); - char8 data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16)); - - char8 tmp_data_0 = src1_data_0 ^ src2_data_0; - char8 tmp_data_1 = src1_data_1 ^ src2_data_1; - char8 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif - - -__kernel void arithm_bitwise_xor_with_mask_C4_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - uchar4 data = src_data1 ^ src_data2; - data = mask_data ? 
data : dst_data; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_bitwise_xor_with_mask_C4_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char4 src_data1 = *((__global char4 *)(src1 + src1_index)); - char4 src_data2 = *((__global char4 *)(src2 + src2_index)); - char4 dst_data = *((__global char4 *)(dst + dst_index)); - - char4 data = src_data1 ^ src_data2; - data = mask_data ? data : dst_data; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_bitwise_xor_with_mask_C4_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - ushort4 data = src_data1 ^ src_data2; - data = mask_data ? data : dst_data; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_xor_with_mask_C4_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = src_data1 ^ src_data2; - data = mask_data ? 
data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_xor_with_mask_C4_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index)); - int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - - int4 data = src_data1 ^ src_data2; - data = mask_data ? data : dst_data; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_bitwise_xor_with_mask_C4_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 4) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index)); - char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index)); - char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index)); - - char16 data = src_data1 ^ src_data2; - data = mask_data ? 
data : dst_data; - - *((__global char16 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_xor_with_mask_C4_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int src2_index = mad24(y, src2_step, (x << 5) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char8 src_data1_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0)); - char8 src_data1_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8)); - char8 src_data1_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16)); - char8 src_data1_3 = *((__global char8 *)((__global char *)src1 + src1_index + 24)); - - char8 src_data2_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0)); - char8 src_data2_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8)); - char8 src_data2_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16)); - char8 src_data2_3 = *((__global char8 *)((__global char *)src2 + src2_index + 24)); - - char8 dst_data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0)); - char8 dst_data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8)); - char8 dst_data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16)); - char8 dst_data_3 = *((__global char8 *)((__global char *)dst + dst_index + 24)); - - char8 data_0 = src_data1_0 ^ src_data2_0; - char8 data_1 = src_data1_1 ^ src_data2_1; - char8 data_2 = src_data1_2 ^ src_data2_2; - char8 data_3 = src_data1_3 ^ src_data2_3; - - data_0 = mask_data ? data_0 : dst_data_0; - data_1 = mask_data ? data_1 : dst_data_1; - data_2 = mask_data ? data_2 : dst_data_2; - data_3 = mask_data ? data_3 : dst_data_3; - - *((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16)) = data_2; - *((__global char8 *)((__global char *)dst + dst_index + 24)) = data_3; - } -} -#endif diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl deleted file mode 100644 index 06672b8c3..000000000 --- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl +++ /dev/null @@ -1,1117 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. 
-// -// @Authors -// Jiang Liyuan, jlyuan001.good@163.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif -////////////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////BITWISE_XOR//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////// -/**************************************bitwise_xor with scalar with mask**************************************/ -__kernel void arithm_s_bitwise_xor_with_mask_C1_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x); - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data ^ src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? 
tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_xor_with_mask_C1_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x); - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data ^ src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; - data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; - data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_xor_with_mask_C1_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort2 src2_data = (ushort2)(src2.x, src2.x); - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); - ushort2 tmp_data = src1_data ^ src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? 
tmp_data.y : data.y; - - *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C1_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); - short2 src2_data = (short2)(src2.x, src2.x); - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - short2 tmp_data = src1_data ^ src2_data; - - data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x; - data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y; - - *((__global short2 *)((__global uchar *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C1_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); - int src_data2 = src2.x; - int dst_data = *((__global int *)((__global char *)dst + dst_index)); - - int data = src_data1 ^ src_data2; - data = mask_data ? data : dst_data; - - *((__global int *)((__global char *)dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_xor_with_mask_C1_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char4 src1_data = *((__global char4 *)((__global char *)src1 + src1_index)); - char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index)); - - char4 data = src1_data ^ src2_data; - data = mask_data ? 
data : dst_data; - - *((__global char4 *)((__global char *)dst + dst_index)) = data; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_with_mask_C1_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = src1_data ^ src2_data; - data = mask_data ? data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -#endif -__kernel void arithm_s_bitwise_xor_with_mask_C2_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y); - uchar2 mask_data = vload2(0, mask + mask_index); - - uchar4 data = *((__global uchar4 *)(dst + dst_index)); - uchar4 tmp_data = src1_data ^ src2_data; - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_xor_with_mask_C2_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align ((dst_offset >> 1) & 1) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); - - char4 src1_data = vload4(0, src1 + src1_index); - char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y); - uchar2 mask_data = vload2(0, mask + mask_index); - - char4 data = *((__global char4 *)(dst + dst_index)); - char4 tmp_data = src1_data ^ src2_data; - - data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? 
tmp_data.xy : data.xy; - data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_xor_with_mask_C2_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); - ushort2 src_data2 = (ushort2)(src2.x, src2.y); - ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index)); - - ushort2 data = src_data1 ^ src_data2; - data = mask_data ? data : dst_data; - - *((__global ushort2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C2_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); - short2 src_data2 = (short2)(src2.x, src2.y); - short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index)); - - short2 data = src_data1 ^ src_data2; - data = mask_data ? data : dst_data; - - *((__global short2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C2_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); - int2 src_data2 = (int2)(src2.x, src2.y); - int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index)); - - int2 data = src_data1 ^ src_data2; - data = mask_data ? 
data : dst_data; - - *((__global int2 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C2_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index)); - char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); - char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index)); - - char8 data = src1_data ^ src2_data; - - data = mask_data ? data : dst_data; - - *((__global char8 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_with_mask_C2_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index)); - short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); - short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index)); - - short8 data = src1_data ^ src2_data; - data = mask_data ? 
data : dst_data; - - *((__global short8 *)((__global char *)dst + dst_index)) = data; - } -} -#endif -__kernel void arithm_s_bitwise_xor_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0; - uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1; - uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_xor_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_xor_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0; - ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1; - ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
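The three-channel (C3) kernels above cover several pixels per work-item, so the 3-component scalar has to be replicated with a rotating channel order to stay aligned with the interleaved data. A short sketch of that replication for an 8-bit BGR scalar (a hedged illustration of the constants built in the C3_D0/C3_D1 kernels, not part of the patch):

/* For 4 packed 3-channel pixels, 12 bytes are covered by three uchar4 loads.
   Repeating the scalar (c0, c1, c2) across those loads gives:
     bytes  0.. 3 -> c0 c1 c2 c0
     bytes  4.. 7 -> c1 c2 c0 c1
     bytes  8..11 -> c2 c0 c1 c2                                        */
inline void replicate_c3_scalar(uchar4 s, uchar4 *s0, uchar4 *s1, uchar4 *s2)
{
    *s0 = (uchar4)(s.x, s.y, s.z, s.x);
    *s1 = (uchar4)(s.y, s.z, s.x, s.y);
    *s2 = (uchar4)(s.z, s.x, s.y, s.z);
}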
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 ^ src2_data_0; - short2 tmp_data_1 = src1_data_1 ^ src2_data_1; - short2 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 ^ src2_data_0; - int tmp_data_1 = src1_data_1 ^ src2_data_1; - int tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - uchar mask_data = * (mask + mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_with_mask_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - uchar mask_data = * (mask + mask_index); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 ^ src2_data_0; - short4 tmp_data_1 = src1_data_1 ^ src2_data_1; - short4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif -__kernel void arithm_s_bitwise_xor_with_mask_C4_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index)); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - - uchar4 data = src_data1 ^ src2; - data = mask_data ? 
data : dst_data; - - *((__global uchar4 *)(dst + dst_index)) = data; - } -} - - -__kernel void arithm_s_bitwise_xor_with_mask_C4_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char4 src_data1 = *((__global char4 *)(src1 + src1_index)); - char4 dst_data = *((__global char4 *)(dst + dst_index)); - - char4 data = src_data1 ^ src2; - data = mask_data ? data : dst_data; - - *((__global char4 *)(dst + dst_index)) = data; - } -} - -__kernel void arithm_s_bitwise_xor_with_mask_C4_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index)); - ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - - ushort4 data = src_data1 ^ src2; - data = mask_data ? data : dst_data; - - *((__global ushort4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C4_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index)); - short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - - short4 data = src_data1 ^ src2; - data = mask_data ? data : dst_data; - - *((__global short4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C4_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index)); - int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - - int4 data = src_data1 ^ src2; - data = mask_data ? 
data : dst_data; - - *((__global int4 *)((__global char *)dst + dst_index)) = data; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C4_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - char16 src1_data = *((__global char16 *)((__global char *)src1 + src1_index)); - char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7, - src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf); - char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index)); - - char16 data = src1_data ^ src2_data; - data = mask_data ? data : dst_data; - - *((__global char16 *)((__global char *)dst + dst_index)) = data; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_with_mask_C4_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 5) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, (x << 5) + dst_offset); - - uchar mask_data = *(mask + mask_index); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0)); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8)); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf); - - short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0)); - short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8)); - short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - short4 dst_data_3 = *((__global short4 *)((__global char *)dst + dst_index + 24)); - - short4 data_0 = src1_data_0 ^ src2_data_0; - short4 data_1 = src1_data_1 ^ src2_data_1; - short4 data_2 = src1_data_2 ^ src2_data_2; - short4 data_3 = src1_data_3 ^ src2_data_3; - - data_0 = mask_data ? data_0 : dst_data_0; - data_1 = mask_data ? data_1 : dst_data_1; - data_2 = mask_data ? data_2 : dst_data_2; - data_3 = mask_data ? 
data_3 : dst_data_3; - - *((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8)) = data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16)) = data_2; - *((__global short4 *)((__global char *)dst + dst_index + 24)) = data_3; - } -} -#endif diff --git a/modules/ocl/src/opencl/filtering_boxFilter.cl b/modules/ocl/src/opencl/filtering_boxFilter.cl index 79ca8d735..512e32997 100644 --- a/modules/ocl/src/opencl/filtering_boxFilter.cl +++ b/modules/ocl/src/opencl/filtering_boxFilter.cl @@ -79,15 +79,73 @@ #define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) #endif +#define THREADS 256 +#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2) + +inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp, + int dst_rows, int dst_cols, + int dst_startX, int dst_x_off, + float alpha) +{ + if(get_local_id(0) < anX || get_local_id(0) >= (THREADS-ksX+anX+1)) + { + return; + } + + uint4 tmp_sum = 0; + int posX = dst_startX - dst_x_off + (get_local_id(0)-anX)*4; + int posY = (get_group_id(1) << 1); + + for(int i=-anX; i<=anX; i++) + { + tmp_sum += vload4(get_local_id(0), temp+i); + } + + if(posY < dst_rows && posX < dst_cols) + { + tmp_sum /= (uint4) alpha; + if(posX >= 0 && posX < dst_cols) + *(dst) = tmp_sum.x; + if(posX+1 >= 0 && posX+1 < dst_cols) + *(dst + 1) = tmp_sum.y; + if(posX+2 >= 0 && posX+2 < dst_cols) + *(dst + 2) = tmp_sum.z; + if(posX+3 >= 0 && posX+3 < dst_cols) + *(dst + 3) = tmp_sum.w; + } +} + + +inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp, + int dst_rows, int dst_cols, + int dst_startX, int dst_x_off, + float alpha) +{ + if(get_local_id(0) >= (THREADS-ksX+1)) + { + return; + } + + int posX = dst_startX - dst_x_off + get_local_id(0); + int posY = (get_group_id(1) << 1); + + uint4 temp_sum = 0; + for(int i=-anX; i<=anX; i++) + { + temp_sum += temp[get_local_id(0) + anX + i]; + } + + if(posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows) + *dst = convert_uchar4(convert_float4(temp_sum)/alpha); +} + /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////8uC1//////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// -#define THREADS 256 -#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? 
(elem1) : (elem2) __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) + int src_offset, int src_whole_rows, int src_whole_cols, int src_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step + ) { int col = get_local_id(0); @@ -105,115 +163,84 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha int dst_startY = (gY << 1) + dst_y_off; uint4 data[ksY+1]; - __local uint4 temp[(THREADS<<1)]; + __local uint4 temp[2][THREADS]; #ifdef BORDER_CONSTANT - for(int i=0; i < ksY+1; i++) + for(int i=0; i < ksY+1; i++) + { + if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3src_whole_cols-1) - | (startY+i<0) | (startY+i>src_whole_rows-1); - if(not_all_in_range) - { - int selected_row; - int4 selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); + int not_all_in_range; + for(int i=0; i < ksY+1; i++) + { + not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1) + | (startY+i<0) | (startY+i>src_whole_rows-1); + if(not_all_in_range) + { + int selected_row; + int4 selected_col; + selected_row = ADDR_H(startY+i, 0, src_whole_rows); + selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols); - selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x); + selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols); + selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x); - selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols); - selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y); + selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols); + selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y); - selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols); - selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z); + selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols); + selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z); - selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols); - selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w); + selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols); + selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w); - data[i].x = *(src + selected_row * src_step + selected_col.x); - data[i].y = *(src + selected_row * src_step + selected_col.y); - data[i].z = *(src + selected_row * src_step + selected_col.z); - data[i].w = *(src + selected_row * src_step + selected_col.w); - } - else - { - data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX))); - } - } + data[i].x = 
*(src + selected_row * src_step + selected_col.x); + data[i].y = *(src + selected_row * src_step + selected_col.y); + data[i].z = *(src + selected_row * src_step + selected_col.z); + data[i].w = *(src + selected_row * src_step + selected_col.w); + } + else + { + data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX))); + } + } #endif - uint4 sum0 = 0, sum1 = 0, sum2 = 0; + uint4 tmp_sum = 0; for(int i=1; i < ksY; i++) { - sum0 += (data[i]); + tmp_sum += (data[i]); } - sum1 = sum0 + (data[0]); - sum2 = sum0 + (data[ksY]); + + int index = dst_startY * dst_step + dst_startX + (col-anX)*4; - temp[col] = sum1; - temp[col+THREADS] = sum2; + temp[0][col] = tmp_sum + (data[0]); + temp[1][col] = tmp_sum + (data[ksY]); barrier(CLK_LOCAL_MEM_FENCE); - - if(col >= anX && col < (THREADS-ksX+anX+1)) - { - int posX = dst_startX - dst_x_off + (col-anX)*4; - int posY = (gY << 1); - uint4 tmp_sum1=0, tmp_sum2=0; - for(int i=-anX; i<=anX; i++) - { - tmp_sum1 += vload4(col, (__local uint*)temp+i); - } - - for(int i=-anX; i<=anX; i++) - { - tmp_sum2 += vload4(col, (__local uint*)(temp+THREADS)+i); - } - - if(posY < dst_rows && posX < dst_cols) - { - if(posX >= 0 && posX < dst_cols) - *(dst+dst_startY * dst_step + dst_startX + (col-anX)*4) = tmp_sum1.x/alpha; - if(posX+1 >= 0 && posX+1 < dst_cols) - *(dst+dst_startY * dst_step + dst_startX+1 + (col-anX)*4) = tmp_sum1.y/alpha; - if(posX+2 >= 0 && posX+2 < dst_cols) - *(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum1.z/alpha; - if(posX+3 >= 0 && posX+3 < dst_cols) - *(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum1.w/alpha; - } - if(posY+1 < dst_rows && posX < dst_cols) - { - dst_startY+=1; - if(posX >= 0 && posX < dst_cols) - *(dst+dst_startY * dst_step + dst_startX + (col-anX)*4) = tmp_sum2.x/alpha; - if(posX+1 >= 0 && posX+1 < dst_cols) - *(dst+dst_startY * dst_step + dst_startX+1 + (col-anX)*4) = tmp_sum2.y/alpha; - if(posX+2 >= 0 && posX+2 < dst_cols) - *(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum2.z/alpha; - if(posX+3 >= 0 && posX+3 < dst_cols) - *(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum2.w/alpha; - } - } + update_dst_C1_D0(dst+index, (__local uint *)(temp[0]), + dst_rows, dst_cols, dst_startX, dst_x_off, alpha); + update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]), + dst_rows, dst_cols, dst_startX, dst_x_off, alpha); } @@ -221,9 +248,9 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha /////////////////////////////////////////8uC4//////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) + int src_offset, int src_whole_rows, int src_whole_cols, int src_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step + ) { int col = get_local_id(0); const int gX = get_group_id(0); @@ -238,81 +265,63 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch int startY = (gY << 1) - anY + src_y_off; int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; int dst_startY = (gY << 1) + dst_y_off; - //int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4; - int end_addr = src_whole_cols-4; uint4 data[ksY+1]; __local uint4 
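The boxFilter_C1_D0 rewrite above keeps the separable-sum structure of the old code: every work-item first accumulates a vertical sum of the kernel rows, stores the two candidate row sums (with data[0] or data[ksY] added) into the new temp[2][THREADS] local array, and only then the update_dst_* helpers perform the horizontal accumulation over the 2*anX+1 neighbours and the division by alpha. A reduced sketch of the same two-pass idea for a single-channel float filter, assuming a (THREADS, 1) work-group and element-indexed steps (names and signatures are assumptions, not the patch itself):

#define THREADS 256

__kernel void box_rows_then_cols(__global const float *src, __global float *dst,
                                 int cols, int rows, int src_step, int dst_step,
                                 int ksX, int ksY, float alpha)
{
    int col = get_local_id(0);
    int gx  = get_group_id(0);
    int gy  = get_group_id(1);
    int anX = ksX >> 1;

    __local float temp[THREADS];

    /* pass 1: vertical sum of ksY rows for this lane's column */
    int x = gx * (THREADS - ksX + 1) - anX + col;
    float vsum = 0.f;
    for (int i = 0; i < ksY; ++i)
    {
        int yy = clamp(gy - (ksY >> 1) + i, 0, rows - 1);   /* replicate the border */
        int xx = clamp(x, 0, cols - 1);
        vsum += src[yy * src_step + xx];
    }
    temp[col] = vsum;
    barrier(CLK_LOCAL_MEM_FENCE);

    /* pass 2: horizontal sum over the kernel width, interior lanes only */
    if (col >= anX && col < THREADS - ksX + anX + 1)
    {
        float total = 0.f;
        for (int i = -anX; i <= anX; ++i)
            total += temp[col + i];

        int outX = gx * (THREADS - ksX + 1) + (col - anX);
        if (outX < cols && gy < rows)
            dst[gy * dst_step + outX] = total / alpha;
    }
}

Factoring the second pass into helpers, as the patch does, lets the same horizontal accumulation serve both output rows produced by one work-group.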
temp[2][THREADS]; + #ifdef BORDER_CONSTANT bool con; - uint4 ss; for(int i=0; i < ksY+1; i++) { con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - - //int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr); - //ss = convert_uint4(src[cur_addr]); - int cur_col = clamp(startX + col, 0, src_whole_cols); - if(con) - ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]); - data[i] = con ? ss : 0; + data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0; + data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0; + data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0; + data[i].w = con ? src[(startY+i)*(src_step>>2) + cur_col].w : 0; } #else - for(int i=0; i < ksY+1; i++) - { - int selected_row; - int selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); + for(int i=0; i < ksY+1; i++) + { + int selected_row; + int selected_col; + selected_row = ADDR_H(startY+i, 0, src_whole_rows); + selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - selected_col = ADDR_L(startX+col, 0, src_whole_cols); - selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); + selected_col = ADDR_L(startX+col, 0, src_whole_cols); + selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); - data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]); - } + data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]); + } #endif - uint4 sum0 = 0, sum1 = 0, sum2 = 0; + uint4 tmp_sum = 0; for(int i=1; i < ksY; i++) { - sum0 += (data[i]); + tmp_sum += (data[i]); } - sum1 = sum0 + (data[0]); - sum2 = sum0 + (data[ksY]); - temp[0][col] = sum1; - temp[1][col] = sum2; + + int index = dst_startY * (dst_step>>2)+ dst_startX + col; + + temp[0][col] = tmp_sum + (data[0]); + temp[1][col] = tmp_sum + (data[ksY]); barrier(CLK_LOCAL_MEM_FENCE); - if(col < (THREADS-(ksX-1))) - { - col += anX; - int posX = dst_startX - dst_x_off + col - anX; - int posY = (gY << 1); + update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]), + dst_rows, dst_cols, dst_startX, dst_x_off, alpha); + update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]), + dst_rows, dst_cols, dst_startX, dst_x_off, alpha); - uint4 tmp_sum[2]={(uint4)(0,0,0,0),(uint4)(0,0,0,0)}; - for(int k=0; k<2; k++) - for(int i=-anX; i<=anX; i++) - { - tmp_sum[k] += temp[k][col+i]; - } - for(int i=0; i<2; i++) - { - if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows) - dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = convert_uchar4(convert_float4(tmp_sum[i])/alpha); - } - - } } /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////32fC1//////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) + int src_offset, int src_whole_rows, int src_whole_cols, int src_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step + ) { int col = get_local_id(0); const int gX = get_group_id(0); @@ -327,7 +336,6 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float int startY = (gY << 1) 
- anY + src_y_off; int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; int dst_startY = (gY << 1) + dst_y_off; - int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4; float data[ksY+1]; __local float temp[2][THREADS]; #ifdef BORDER_CONSTANT @@ -336,28 +344,25 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float for(int i=0; i < ksY+1; i++) { con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - //int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr); - //ss = src[cur_addr]; int cur_col = clamp(startX + col, 0, src_whole_cols); - //ss = src[(startY+i)*(src_step>>2) + cur_col]; - ss = (startY+i)=0&&cur_col>=0&&cur_col>2) + cur_col]:0; + ss = (startY+i)=0&&cur_col>=0&&cur_col>2) + cur_col]:(float)0; data[i] = con ? ss : 0.f; } #else - for(int i=0; i < ksY+1; i++) - { - int selected_row; - int selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); + for(int i=0; i < ksY+1; i++) + { + int selected_row; + int selected_col; + selected_row = ADDR_H(startY+i, 0, src_whole_rows); + selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - selected_col = ADDR_L(startX+col, 0, src_whole_cols); - selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); + selected_col = ADDR_L(startX+col, 0, src_whole_cols); + selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); - data[i] = src[selected_row * (src_step>>2) + selected_col]; - } + data[i] = src[selected_row * (src_step>>2) + selected_col]; + } #endif float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; @@ -376,7 +381,7 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float int posX = dst_startX - dst_x_off + col - anX; int posY = (gY << 1); - float tmp_sum[2]={0.0, 0.0}; + float tmp_sum[2]= {0.0, 0.0}; for(int k=0; k<2; k++) for(int i=-anX; i<=anX; i++) { @@ -395,9 +400,9 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float /////////////////////////////////////////32fC4//////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) + int src_offset, int src_whole_rows, int src_whole_cols, int src_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step + ) { int col = get_local_id(0); const int gX = get_group_id(0); @@ -412,7 +417,6 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa int startY = (gY << 1) - anY + src_y_off; int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; int dst_startY = (gY << 1) + dst_y_off; - int end_addr = (src_whole_rows-1)*(src_step>>4) + src_whole_cols-16; float4 data[ksY+1]; __local float4 temp[2][THREADS]; #ifdef BORDER_CONSTANT @@ -421,28 +425,25 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa for(int i=0; i < ksY+1; i++) { con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - //int cur_addr = clamp((startY+i)*(src_step>>4)+(startX+col),0,end_addr); - //ss = src[cur_addr]; int cur_col = clamp(startX + col, 0, src_whole_cols); - //ss = src[(startY+i)*(src_step>>4) + cur_col]; - ss = (startY+i)=0&&cur_col>=0&&cur_col>4) + 
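For the non-constant border modes, the loads above never test whether a coordinate is in range; instead the ADDR_H/ADDR_B and ADDR_L/ADDR_R macro pairs remap an out-of-range row or column to a valid one before indexing, while the BORDER_CONSTANT branch tests the range and substitutes zero. The macros are chosen per border mode at compile time; as one hedged example, a clamp-to-edge variant of the same idea could look like this (illustrative only, not the macros' actual definitions):

/* Fold an out-of-range coordinate back to the nearest valid one, then load. */
inline int clamp_coord(int i, int len)
{
    return clamp(i, 0, len - 1);
}

inline float load_bordered(__global const float *src, int step,
                           int y, int x, int rows, int cols)
{
    return src[clamp_coord(y, rows) * step + clamp_coord(x, cols)];
}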
cur_col]:0; + ss = (startY+i)=0&&cur_col>=0&&cur_col>4) + cur_col]:(float4)0; data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0); } #else - for(int i=0; i < ksY+1; i++) - { - int selected_row; - int selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); + for(int i=0; i < ksY+1; i++) + { + int selected_row; + int selected_col; + selected_row = ADDR_H(startY+i, 0, src_whole_rows); + selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - selected_col = ADDR_L(startX+col, 0, src_whole_cols); - selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); + selected_col = ADDR_L(startX+col, 0, src_whole_cols); + selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); - data[i] = src[selected_row * (src_step>>4) + selected_col]; - } + data[i] = src[selected_row * (src_step>>4) + selected_col]; + } #endif float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; @@ -461,7 +462,7 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa int posX = dst_startX - dst_x_off + col - anX; int posY = (gY << 1); - float4 tmp_sum[2]={(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)}; + float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)}; for(int k=0; k<2; k++) for(int i=-anX; i<=anX; i++) { diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl index a41103a85..833fb3c3f 100644 --- a/modules/ocl/src/opencl/haarobjectdetect.cl +++ b/modules/ocl/src/opencl/haarobjectdetect.cl @@ -112,7 +112,7 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade } GpuHidHaarClassifierCascade; -__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(//constant GpuHidHaarClassifierCascade * cascade, +__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade( global GpuHidHaarStageClassifier * stagecascadeptr, global int4 * info, global GpuHidHaarTreeNode * nodeptr, @@ -128,12 +128,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa const int splitnode, const int4 p, const int4 pq, - const float correction - //const int width, - //const int height, - //const int grpnumperline, - //const int totalgrp -) + const float correction) { int grpszx = get_local_size(0); int grpszy = get_local_size(1); @@ -145,13 +140,8 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int lcl_sz = mul24(grpszx,grpszy); int lcl_id = mad24(lclidy,grpszx,lclidx); - //assume lcl_sz == 256 or 128 or 64 - //int lcl_sz_shift = (lcl_sz == 256) ? 8 : 7; - //lcl_sz_shift = (lcl_sz == 64) ? 6 : lcl_sz_shift; __local int lclshare[1024]; - -#define OFF 0 - __local int* lcldata = lclshare + OFF;//for save win data + __local int* lcldata = lclshare;//for save win data __local int* glboutindex = lcldata + 28*28;//for save global out index __local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel __local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel @@ -181,7 +171,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int totalgrp = scaleinfo1.y & 0xffff; int imgoff = scaleinfo1.z; float factor = as_float(scaleinfo1.w); - //int ystep =1;// factor > 2.0 ? 
1 : 2; __global const int * sum = sum1 + imgoff; __global const float * sqsum = sqsum1 + imgoff; @@ -191,8 +180,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int grpidx = grploop - mul24(grpidy, grpnumperline); int x = mad24(grpidx,grpszx,lclidx); int y = mad24(grpidy,grpszy,lclidy); - //candidate_result.x = convert_int_rtn(x*factor); - //candidate_result.y = convert_int_rtn(y*factor); int grpoffx = x-lclidx; int grpoffy = y-lclidy; @@ -207,18 +194,11 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int glb_x = grpoffx + (lcl_x<<2); int glb_y = grpoffy + lcl_y; - int glb_off = mad24(glb_y,pixelstep,glb_x); + int glb_off = mad24(min(glb_y, height - 1),pixelstep,glb_x); int4 data = *(__global int4*)&sum[glb_off]; int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2); -#if OFF - lcldata[lcl_off] = data.x; - lcldata[lcl_off+1] = data.y; - lcldata[lcl_off+2] = data.z; - lcldata[lcl_off+3] = data.w; -#else vstore4(data, 0, &lcldata[lcl_off]); -#endif } lcloutindex[lcl_id] = 0; @@ -231,184 +211,170 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa int lcl_off = mad24(lclidy,readwidth,lclidx); int4 cascadeinfo1, cascadeinfo2; cascadeinfo1 = p; - cascadeinfo2 = pq;// + mad24(y, pixelstep, x); + cascadeinfo2 = pq; + cascadeinfo1.x +=lcl_off; + cascadeinfo1.z +=lcl_off; + mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] - + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)]) + *correction; - //if((x < width) && (y < height)) + int p_offset = mad24(y, pixelstep, x); + + cascadeinfo2.x +=p_offset; + cascadeinfo2.z +=p_offset; + variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] - + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)]; + + variance_norm_factor = variance_norm_factor * correction - mean * mean; + variance_norm_factor = variance_norm_factor >=0.f ? 
sqrt(variance_norm_factor) : 1.f; + + for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ ) { - cascadeinfo1.x +=lcl_off; - cascadeinfo1.z +=lcl_off; - mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] - - lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)]) - *correction; + float stage_sum = 0.f; + int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); + float stagethreshold = as_float(stageinfo.y); + for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ ) + { + __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter); - int p_offset = mad24(y, pixelstep, x); + int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0])); + int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); + int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); + float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); + float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0])); + float nodethreshold = w.w * variance_norm_factor; - cascadeinfo2.x +=p_offset; - cascadeinfo2.z +=p_offset; - variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] - - sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)]; + info1.x +=lcl_off; + info1.z +=lcl_off; + info2.x +=lcl_off; + info2.z +=lcl_off; - variance_norm_factor = variance_norm_factor * correction - mean * mean; - variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f; - //if( cascade->is_stump_based ) - //{ - for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ ) + float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - + lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; + + classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - + lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; + + info3.x +=lcl_off; + info3.z +=lcl_off; + classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - + lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; + + stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x; + nodecounter++; + } + + result = (stage_sum >= stagethreshold); + } + + if(result && (x < width) && (y < height)) + { + int queueindex = atomic_inc(lclcount); + lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; + lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); + } + barrier(CLK_LOCAL_MEM_FENCE); + int queuecount = lclcount[0]; + barrier(CLK_LOCAL_MEM_FENCE); + nodecounter = splitnode; + for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++) + { + lclcount[0]=0; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); + float stagethreshold = as_float(stageinfo.y); + + int perfscale = queuecount > 4 ? 
3 : 2; + int queuecount_loop = (queuecount + (1<> perfscale; + int lcl_compute_win = lcl_sz >> perfscale; + int lcl_compute_win_id = (lcl_id >>(6-perfscale)); + int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale); + int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale)); + for(int queueloop=0; queueloop>16),readwidth,temp_coord & 0xffff); + + if(lcl_compute_win_id < queuecount) { - __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter); - int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0])); - int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); - int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); - float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); - float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0])); - float nodethreshold = w.w * variance_norm_factor; + int tempnodecounter = lcl_compute_id; + float part_sum = 0.f; + for(int lcl_loop=0; lcl_loopp[0][0])); + int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); + int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); + float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); + float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0])); + float nodethreshold = w.w * variance_norm_factor; - float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - - lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; + info1.x +=queue_pixel; + info1.z +=queue_pixel; + info2.x +=queue_pixel; + info2.z +=queue_pixel; + + float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - + lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; - classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - - lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; + classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - + lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; + info3.x +=queue_pixel; + info3.z +=queue_pixel; + classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - + lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; - //if((info3.z - info3.x) && (!stageinfo.z)) - //{ - info3.x +=lcl_off; - info3.z +=lcl_off; - classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - - lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; - //} - stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x; - nodecounter++; + part_sum += classsum >= nodethreshold ? 
alpha2.y : alpha2.x; + tempnodecounter +=lcl_compute_win; + }//end for(int lcl_loop=0;lcl_loop= stagethreshold); - } - - if(result && (x < width) && (y < height)) - { - int queueindex = atomic_inc(lclcount); - lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; - lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); - } - barrier(CLK_LOCAL_MEM_FENCE); - int queuecount = lclcount[0]; - barrier(CLK_LOCAL_MEM_FENCE); - nodecounter = splitnode; - for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++) - { - //barrier(CLK_LOCAL_MEM_FENCE); - //if(lcl_id == 0) - lclcount[0]=0; barrier(CLK_LOCAL_MEM_FENCE); - - int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); - float stagethreshold = as_float(stageinfo.y); - - int perfscale = queuecount > 4 ? 3 : 2; - int queuecount_loop = (queuecount + (1<> perfscale; - int lcl_compute_win = lcl_sz >> perfscale; - int lcl_compute_win_id = (lcl_id >>(6-perfscale)); - int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale); - int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale)); - for(int queueloop=0; queueloop>16),readwidth,temp_coord & 0xffff); - - //barrier(CLK_LOCAL_MEM_FENCE); - if(lcl_compute_win_id < queuecount) + for(int i=0; ip[0][0])); - int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); - int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); - float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); - float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0])); - float nodethreshold = w.w * variance_norm_factor; - - info1.x +=queue_pixel; - info1.z +=queue_pixel; - info2.x +=queue_pixel; - info2.z +=queue_pixel; - - float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - - lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; - - - classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - - lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; - //if((info3.z - info3.x) && (!stageinfo.z)) - //{ - info3.x +=queue_pixel; - info3.z +=queue_pixel; - classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - - lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; - //} - part_sum += classsum >= nodethreshold ? 
alpha2.y : alpha2.x; - tempnodecounter +=lcl_compute_win; - }//end for(int lcl_loop=0;lcl_loop= stagethreshold && (lcl_compute_id==0)) { - for(int i=0; i= stagethreshold && (lcl_compute_id==0)) - { - int queueindex = atomic_inc(lclcount); - lcloutindex[queueindex<<1] = temp_coord; - lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); - } - lcl_compute_win_id +=(1<0;stageloop++) - //barrier(CLK_LOCAL_MEM_FENCE); - if(lcl_id> 16)); - temp = glboutindex[0]; - int4 candidate_result; - candidate_result.zw = (int2)convert_int_rtn(factor*20.f); - candidate_result.x = convert_int_rtn(x*factor); - candidate_result.y = convert_int_rtn(y*factor); - atomic_inc(glboutindex); - candidate[outputoff+temp+lcl_id] = candidate_result; - } + }//end for(int queueloop=0;queueloop0;stageloop++) + + if(lcl_id> 16)); + temp = glboutindex[0]; + int4 candidate_result; + candidate_result.zw = (int2)convert_int_rtn(factor*20.f); + candidate_result.x = convert_int_rtn(x*factor); + candidate_result.y = convert_int_rtn(y*factor); + atomic_inc(glboutindex); + candidate[outputoff+temp+lcl_id] = candidate_result; + } + barrier(CLK_LOCAL_MEM_FENCE); }//end for(int grploop=grpidx;grploop> 16; + int height = scaleinfo1.x & 0xffff; + int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16; + int totalgrp = scaleinfo1.y & 0xffff; + float factor = as_float(scaleinfo1.w); + float correction_t = correction[scalei]; + int ystep = (int)(max(2.0f, factor) + 0.5f); - for (int scalei = 0; scalei < loopcount; scalei++) + for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx) { - int4 scaleinfo1; - scaleinfo1 = info[scalei]; - int width = (scaleinfo1.x & 0xffff0000) >> 16; - int height = scaleinfo1.x & 0xffff; - int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16; - int totalgrp = scaleinfo1.y & 0xffff; - float factor = as_float(scaleinfo1.w); - float correction_t = correction[scalei]; - int ystep = (int)(max(2.0f, factor) + 0.5f); + int4 cascadeinfo = p[scalei]; + int grpidy = grploop / grpnumperline; + int grpidx = grploop - mul24(grpidy, grpnumperline); + int ix = mad24(grpidx, grpszx, lclidx); + int iy = mad24(grpidy, grpszy, lclidy); + int x = ix * ystep; + int y = iy * ystep; + lcloutindex[lcl_id] = 0; + lclcount[0] = 0; + int nodecounter; + float mean, variance_norm_factor; + //if((ix < width) && (iy < height)) + { + const int p_offset = mad24(y, step, x); + cascadeinfo.x += p_offset; + cascadeinfo.z += p_offset; + mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] - + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)]) + * correction_t; + variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] - + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)]; + variance_norm_factor = variance_norm_factor * correction_t - mean * mean; + variance_norm_factor = variance_norm_factor >= 0.f ? 
sqrt(variance_norm_factor) : 1.f; + bool result = true; + nodecounter = startnode + nodecount * scalei; - for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx) + for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++) { - int4 cascadeinfo = p[scalei]; - int grpidy = grploop / grpnumperline; - int grpidx = grploop - mul24(grpidy, grpnumperline); - int ix = mad24(grpidx, grpszx, lclidx); - int iy = mad24(grpidy, grpszy, lclidy); - int x = ix * ystep; - int y = iy * ystep; - lcloutindex[lcl_id] = 0; - lclcount[0] = 0; - int result = 1, nodecounter; - float mean, variance_norm_factor; - //if((ix < width) && (iy < height)) - { - const int p_offset = mad24(y, step, x); - cascadeinfo.x += p_offset; - cascadeinfo.z += p_offset; - mean = (sum[mad24(cascadeinfo.y, step, cascadeinfo.x)] - sum[mad24(cascadeinfo.y, step, cascadeinfo.z)] - - sum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sum[mad24(cascadeinfo.w, step, cascadeinfo.z)]) - * correction_t; - variance_norm_factor = sqsum[mad24(cascadeinfo.y, step, cascadeinfo.x)] - sqsum[mad24(cascadeinfo.y, step, cascadeinfo.z)] - - sqsum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sqsum[mad24(cascadeinfo.w, step, cascadeinfo.z)]; - variance_norm_factor = variance_norm_factor * correction_t - mean * mean; - variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f; - result = 1; - nodecounter = startnode + nodecount * scalei; - - for (int stageloop = start_stage; stageloop < end_stage && result; stageloop++) - { - float stage_sum = 0.f; - int4 stageinfo = *(global int4 *)(stagecascadeptr + stageloop); - float stagethreshold = as_float(stageinfo.y); - - for (int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++) - { - __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter); - int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0])); - int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0])); - int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0])); - float4 w = *(__global float4 *)(&(currentnodeptr->weight[0])); - float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0])); - float nodethreshold = w.w * variance_norm_factor; - info1.x += p_offset; - info1.z += p_offset; - info2.x += p_offset; - info2.z += p_offset; - float classsum = (sum[mad24(info1.y, step, info1.x)] - sum[mad24(info1.y, step, info1.z)] - - sum[mad24(info1.w, step, info1.x)] + sum[mad24(info1.w, step, info1.z)]) * w.x; - classsum += (sum[mad24(info2.y, step, info2.x)] - sum[mad24(info2.y, step, info2.z)] - - sum[mad24(info2.w, step, info2.x)] + sum[mad24(info2.w, step, info2.z)]) * w.y; - info3.x += p_offset; - info3.z += p_offset; - classsum += (sum[mad24(info3.y, step, info3.x)] - sum[mad24(info3.y, step, info3.z)] - - sum[mad24(info3.w, step, info3.x)] + sum[mad24(info3.w, step, info3.z)]) * w.z; - stage_sum += classsum >= nodethreshold ? 
alpha2.y : alpha2.x; - nodecounter++; - } - - result = (stage_sum >= stagethreshold); - } - - if (result && (ix < width) && (iy < height)) - { - int queueindex = atomic_inc(lclcount); - lcloutindex[queueindex << 1] = (y << 16) | x; - lcloutindex[(queueindex << 1) + 1] = as_int(variance_norm_factor); - } - - barrier(CLK_LOCAL_MEM_FENCE); - int queuecount = lclcount[0]; - nodecounter = splitnode + nodecount * scalei; - - if (lcl_id < queuecount) - { - int temp = lcloutindex[lcl_id << 1]; - int x = temp & 0xffff; - int y = (temp & (int)0xffff0000) >> 16; - temp = glboutindex[0]; - int4 candidate_result; - candidate_result.zw = (int2)convert_int_rtn(factor * 20.f); - candidate_result.x = x; - candidate_result.y = y; - atomic_inc(glboutindex); - candidate[outputoff + temp + lcl_id] = candidate_result; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } + float stage_sum = 0.f; + int stagecount = stagecascadeptr[stageloop].count; + for (int nodeloop = 0; nodeloop < stagecount; nodeloop++) + { + __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter); + int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0])); + int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0])); + int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0])); + float4 w = *(__global float4 *)(&(currentnodeptr->weight[0])); + float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0])); + float nodethreshold = w.w * variance_norm_factor; + info1.x += p_offset; + info1.z += p_offset; + info2.x += p_offset; + info2.z += p_offset; + float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)] - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] - + sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)] + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x; + classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)] - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] - + sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)] + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y; + info3.x += p_offset; + info3.z += p_offset; + classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)] - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] - + sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z; + stage_sum += classsum >= nodethreshold ? 
alpha2.y : alpha2.x; + nodecounter++; + } + result = (bool)(stage_sum >= stagecascadeptr[stageloop].threshold); } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (result && (ix < width) && (iy < height)) + { + int queueindex = atomic_inc(lclcount); + lcloutindex[queueindex] = (y << 16) | x; + } + + barrier(CLK_LOCAL_MEM_FENCE); + int queuecount = lclcount[0]; + + if (lcl_id < queuecount) + { + int temp = lcloutindex[lcl_id]; + int x = temp & 0xffff; + int y = (temp & (int)0xffff0000) >> 16; + temp = atomic_inc(glboutindex); + int4 candidate_result; + candidate_result.zw = (int2)convert_int_rtn(factor * 20.f); + candidate_result.x = x; + candidate_result.y = y; + candidate[outputoff + temp + lcl_id] = candidate_result; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } } + } } __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum) { - int counter = get_global_id(0); - int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0; - GpuHidHaarTreeNode t1 = *(orinode + counter); + int counter = get_global_id(0); + int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0; + GpuHidHaarTreeNode t1 = *(orinode + counter); #pragma unroll - for (i = 0; i < 3; i++) - { - tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f); - tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f); - tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f); - tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f); - } + for (i = 0; i < 3; i++) + { + tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f); + tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f); + tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f); + tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f); + } - t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]); - counter += nodenum; + t1.weight[0] = t1.p[2][0] ? 
-(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]); + counter += nodenum; #pragma unroll - for (i = 0; i < 3; i++) - { - newnode[counter].p[i][0] = tr_x[i]; - newnode[counter].p[i][1] = tr_y[i]; - newnode[counter].p[i][2] = tr_x[i] + tr_w[i]; - newnode[counter].p[i][3] = tr_y[i] + tr_h[i]; - newnode[counter].weight[i] = t1.weight[i] * weight_scale; - } + for (i = 0; i < 3; i++) + { + newnode[counter].p[i][0] = tr_x[i]; + newnode[counter].p[i][1] = tr_y[i]; + newnode[counter].p[i][2] = tr_x[i] + tr_w[i]; + newnode[counter].p[i][3] = tr_y[i] + tr_h[i]; + newnode[counter].weight[i] = t1.weight[i] * weight_scale; + } - newnode[counter].left = t1.left; - newnode[counter].right = t1.right; - newnode[counter].threshold = t1.threshold; - newnode[counter].alpha[0] = t1.alpha[0]; - newnode[counter].alpha[1] = t1.alpha[1]; + newnode[counter].left = t1.left; + newnode[counter].right = t1.right; + newnode[counter].threshold = t1.threshold; + newnode[counter].alpha[0] = t1.alpha[0]; + newnode[counter].alpha[1] = t1.alpha[1]; } + diff --git a/modules/ocl/src/opencl/imgproc_integral.cl b/modules/ocl/src/opencl/imgproc_integral.cl index c54695768..d279ef728 100644 --- a/modules/ocl/src/opencl/imgproc_integral.cl +++ b/modules/ocl/src/opencl/imgproc_integral.cl @@ -60,7 +60,7 @@ #define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS) -kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float *sqsum, +kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global float *sqsum, int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) { unsigned int lid = get_local_id(0); @@ -159,7 +159,7 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float } -kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__global int *sum , +kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__global int *sum , __global float *sqsum,int rows,int cols,int src_step,int sum_step, int sqsum_step,int sum_offset,int sqsum_offset) { @@ -275,3 +275,219 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo barrier(CLK_LOCAL_MEM_FENCE); } } + +kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global float *sqsum, + int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) +{ + unsigned int lid = get_local_id(0); + unsigned int gid = get_group_id(0); + float4 src_t[2], sum_t[2]; + float4 sqsum_t[2]; + __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; + __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE]; + __local float* sum_p; + __local float* sqsum_p; + src_step = src_step >> 2; + gid = gid << 1; + for(int i = 0; i < rows; i =i + LSIZE_1) + { + src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : (float4)0); + src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : (float4)0); + + sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); + sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); + sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); + sqsum_t[1] = (i == 0 ? 
(float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); + barrier(CLK_LOCAL_MEM_FENCE); + + int bf_loc = lid + GET_CONFLICT_OFFSET(lid); + lm_sum[0][bf_loc] = src_t[0]; + lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]); + + lm_sum[1][bf_loc] = src_t[1]; + lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]); + + int offset = 1; + for(int d = LSIZE >> 1 ; d > 0; d>>=1) + { + barrier(CLK_LOCAL_MEM_FENCE); + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; + } + offset <<= 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 2) + { + lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; + lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0; + } + for(int d = 1; d < LSIZE; d <<= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + offset >>= 1; + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; + + lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; + lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; + if(lid > 0 && (i+lid) <= rows) + { + lm_sum[0][bf_loc] += sum_t[0]; + lm_sum[1][bf_loc] += sum_t[1]; + lm_sqsum[0][bf_loc] += sqsum_t[0]; + lm_sqsum[1][bf_loc] += sqsum_t[1]; + sum_p = (__local float*)(&(lm_sum[0][bf_loc])); + sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; + sum[loc_s0 + k * dst_step / 4] = sum_p[k]; + sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k]; + } + sum_p = (__local float*)(&(lm_sum[1][bf_loc])); + sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 4 + k + 4 >= cols + pre_invalid) break; + sum[loc_s1 + k * dst_step / 4] = sum_p[k]; + sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} + + +kernel void integral_rows_D5(__global float4 *srcsum,__global float4 * srcsqsum,__global float *sum , + __global float *sqsum,int rows,int cols,int src_step,int sum_step, + int sqsum_step,int sum_offset,int sqsum_offset) +{ + unsigned int lid = get_local_id(0); + unsigned int gid = get_group_id(0); + float4 src_t[2], sum_t[2]; + float4 sqsrc_t[2],sqsum_t[2]; + __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; + __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE]; + __local float *sum_p; + __local float *sqsum_p; + src_step = src_step >> 4; + for(int i = 0; i < rows; i =i + LSIZE_1) + { + src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0; + sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0; + src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; + sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; + + sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); + sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); + sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); + sqsum_t[1] = (i == 0 ? 
(float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); + barrier(CLK_LOCAL_MEM_FENCE); + + int bf_loc = lid + GET_CONFLICT_OFFSET(lid); + lm_sum[0][bf_loc] = src_t[0]; + lm_sqsum[0][bf_loc] = sqsrc_t[0]; + + lm_sum[1][bf_loc] = src_t[1]; + lm_sqsum[1][bf_loc] = sqsrc_t[1]; + + int offset = 1; + for(int d = LSIZE >> 1 ; d > 0; d>>=1) + { + barrier(CLK_LOCAL_MEM_FENCE); + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; + } + offset <<= 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 2) + { + lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; + lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0; + } + for(int d = 1; d < LSIZE; d <<= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + offset >>= 1; + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; + + lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; + lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if(gid == 0 && (i + lid) <= rows) + { + sum[sum_offset + i + lid] = 0; + sqsum[sqsum_offset + i + lid] = 0; + } + if(i + lid == 0) + { + int loc0 = gid * 2 * sum_step; + int loc1 = gid * 2 * sqsum_step; + for(int k = 1; k <= 8; k++) + { + if(gid * 8 + k > cols) break; + sum[sum_offset + loc0 + k * sum_step / 4] = 0; + sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0; + } + } + int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; + int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ; + if(lid > 0 && (i+lid) <= rows) + { + lm_sum[0][bf_loc] += sum_t[0]; + lm_sum[1][bf_loc] += sum_t[1]; + lm_sqsum[0][bf_loc] += sqsum_t[0]; + lm_sqsum[1][bf_loc] += sqsum_t[1]; + sum_p = (__local float*)(&(lm_sum[0][bf_loc])); + sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 8 + k >= cols) break; + sum[loc_s0 + k * sum_step / 4] = sum_p[k]; + sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k]; + } + sum_p = (__local float*)(&(lm_sum[1][bf_loc])); + sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 8 + 4 + k >= cols) break; + sum[loc_s1 + k * sum_step / 4] = sum_p[k]; + sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} \ No newline at end of file diff --git a/modules/ocl/src/opencl/imgproc_integral_sum.cl b/modules/ocl/src/opencl/imgproc_integral_sum.cl index b7b3f2ff0..70f0c63df 100644 --- a/modules/ocl/src/opencl/imgproc_integral_sum.cl +++ b/modules/ocl/src/opencl/imgproc_integral_sum.cl @@ -44,8 +44,13 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable #endif +#endif + #define LSIZE 256 #define LSIZE_1 255 #define LSIZE_2 254 @@ -56,8 +61,8 @@ #define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS) -kernel void integral_sum_cols(__global uchar4 *src,__global int *sum , - int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) +kernel void integral_sum_cols_D4(__global uchar4 *src,__global int *sum , + int src_offset,int 
pre_invalid,int rows,int cols,int src_step,int dst_step) { unsigned int lid = get_local_id(0); unsigned int gid = get_group_id(0); @@ -114,7 +119,8 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum , } } barrier(CLK_LOCAL_MEM_FENCE); - if(lid > 0 && (i+lid) <= rows){ + if(lid > 0 && (i+lid) <= rows) + { int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; lm_sum[0][bf_loc] += sum_t[0]; lm_sum[1][bf_loc] += sum_t[1]; @@ -136,9 +142,9 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum , } -kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum , - int rows,int cols,int src_step,int sum_step, - int sum_offset) +kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum , + int rows,int cols,int src_step,int sum_step, + int sum_offset) { unsigned int lid = get_local_id(0); unsigned int gid = get_group_id(0); @@ -196,19 +202,20 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum , barrier(CLK_LOCAL_MEM_FENCE); if(gid == 0 && (i + lid) <= rows) { - sum[sum_offset + i + lid] = 0; + sum[sum_offset + i + lid] = 0; } if(i + lid == 0) { int loc0 = gid * 2 * sum_step; - for(int k = 1;k <= 8;k++) + for(int k = 1; k <= 8; k++) { if(gid * 8 + k > cols) break; sum[sum_offset + loc0 + k * sum_step / 4] = 0; } } - if(lid > 0 && (i+lid) <= rows){ + if(lid > 0 && (i+lid) <= rows) + { int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; lm_sum[0][bf_loc] += sum_t[0]; lm_sum[1][bf_loc] += sum_t[1]; @@ -228,3 +235,178 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum , barrier(CLK_LOCAL_MEM_FENCE); } } + +kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum , + int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) +{ + unsigned int lid = get_local_id(0); + unsigned int gid = get_group_id(0); + float4 src_t[2], sum_t[2]; + __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; + __local float* sum_p; + src_step = src_step >> 2; + gid = gid << 1; + for(int i = 0; i < rows; i =i + LSIZE_1) + { + src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid]) : (float4)0); + src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid + 1]) : (float4)0); + + sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); + sum_t[1] = (i == 0 ? 
(float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); + barrier(CLK_LOCAL_MEM_FENCE); + + int bf_loc = lid + GET_CONFLICT_OFFSET(lid); + lm_sum[0][bf_loc] = src_t[0]; + + lm_sum[1][bf_loc] = src_t[1]; + + int offset = 1; + for(int d = LSIZE >> 1 ; d > 0; d>>=1) + { + barrier(CLK_LOCAL_MEM_FENCE); + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + } + offset <<= 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 2) + { + lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; + } + for(int d = 1; d < LSIZE; d <<= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + offset >>= 1; + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid > 0 && (i+lid) <= rows) + { + int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; + lm_sum[0][bf_loc] += sum_t[0]; + lm_sum[1][bf_loc] += sum_t[1]; + sum_p = (__local float*)(&(lm_sum[0][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; + sum[loc_s0 + k * dst_step / 4] = sum_p[k]; + } + sum_p = (__local float*)(&(lm_sum[1][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 4 + k + 4 >= cols + pre_invalid) break; + sum[loc_s1 + k * dst_step / 4] = sum_p[k]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} + + +kernel void integral_sum_rows_D5(__global float4 *srcsum,__global float *sum , + int rows,int cols,int src_step,int sum_step, + int sum_offset) +{ + unsigned int lid = get_local_id(0); + unsigned int gid = get_group_id(0); + float4 src_t[2], sum_t[2]; + __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; + __local float *sum_p; + src_step = src_step >> 4; + for(int i = 0; i < rows; i =i + LSIZE_1) + { + src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0; + src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; + + sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); + sum_t[1] = (i == 0 ? 
(float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); + barrier(CLK_LOCAL_MEM_FENCE); + + int bf_loc = lid + GET_CONFLICT_OFFSET(lid); + lm_sum[0][bf_loc] = src_t[0]; + + lm_sum[1][bf_loc] = src_t[1]; + + int offset = 1; + for(int d = LSIZE >> 1 ; d > 0; d>>=1) + { + barrier(CLK_LOCAL_MEM_FENCE); + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + } + offset <<= 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < 2) + { + lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; + } + for(int d = 1; d < LSIZE; d <<= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + offset >>= 1; + int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; + ai += GET_CONFLICT_OFFSET(ai); + bi += GET_CONFLICT_OFFSET(bi); + + if((lid & 127) < d) + { + lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; + lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if(gid == 0 && (i + lid) <= rows) + { + sum[sum_offset + i + lid] = 0; + } + if(i + lid == 0) + { + int loc0 = gid * 2 * sum_step; + for(int k = 1; k <= 8; k++) + { + if(gid * 8 + k > cols) break; + sum[sum_offset + loc0 + k * sum_step / 4] = 0; + } + } + + if(lid > 0 && (i+lid) <= rows) + { + int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; + lm_sum[0][bf_loc] += sum_t[0]; + lm_sum[1][bf_loc] += sum_t[1]; + sum_p = (__local float*)(&(lm_sum[0][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 8 + k >= cols) break; + sum[loc_s0 + k * sum_step / 4] = sum_p[k]; + } + sum_p = (__local float*)(&(lm_sum[1][bf_loc])); + for(int k = 0; k < 4; k++) + { + if(gid * 8 + 4 + k >= cols) break; + sum[loc_s1 + k * sum_step / 4] = sum_p[k]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} diff --git a/modules/ocl/src/opencl/moments.cl b/modules/ocl/src/opencl/moments.cl index cee88a05e..c86ae494d 100644 --- a/modules/ocl/src/opencl/moments.cl +++ b/modules/ocl/src/opencl/moments.cl @@ -1,3 +1,48 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Sen Liu, swjtuls1987@126.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + #if defined (DOUBLE_SUPPORT) #ifdef cl_khr_fp64 @@ -609,22 +654,33 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols int y = wgidy*TILE_SIZE; // real Y index of pixel int x = wgidx*TILE_SIZE; // real X index of pixel int kcn = (cn==2)?2:4; - int rstep = min(src_step/4, TILE_SIZE); + src_step /= sizeof(*src_data); + int rstep = min(src_step, TILE_SIZE); tileSize_height = min(TILE_SIZE, src_rows - y); tileSize_width = min(TILE_SIZE, src_cols -x); - if(tileSize_width < TILE_SIZE) - for(int i = tileSize_width; i < rstep; i++ ) - *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0; + int maxIdx = mul24(src_rows, src_cols); + int yOff = (y+lidy)*src_step; + int index; + if(tileSize_width < TILE_SIZE && yOff < src_rows) + for(int i = tileSize_width; i < rstep && (yOff+x+i) < maxIdx; i++ ) + *(src_data+yOff+x+i) = 0; if( coi > 0 ) for(int i=0; i < tileSize_width; i+=VLEN_F) { +#pragma unroll for(int j=0; j<4; j++) - tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1); + { + index = yOff+(x+i+j)*kcn+coi-1; + if (index < maxIdx) + tmp_coi[j] = *(src_data+index); + else + tmp_coi[j] = 0; + } tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); } else - for(int i=0; i < tileSize_width; i+=VLEN_F) - tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3)); + for(int i=0; i < tileSize_width && (yOff+x+i) < maxIdx; i+=VLEN_F) + tmp[i/VLEN_F] = (*(__global float4 *)(src_data+yOff+x+i)); float4 zero = (float4)(0); float4 full = (float4)(255); if( binary ) @@ -714,35 +770,59 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols // accumulate moments computed in each tile dst_step /= sizeof(F); + int dst_x_off = mad24(wgidy, dst_cols, wgidx); + int dst_off = 0; + int max_dst_index = 10 * blocky * get_global_size(1); + // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; + dst_off = mad24(DST_ROW_00 * blocky, dst_step, dst_x_off); + if (dst_off < max_dst_index) + *(dst_m + dst_off) = mom[0]; // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; + dst_off = mad24(DST_ROW_10 * blocky, dst_step, dst_x_off); + if (dst_off < max_dst_index) + *(dst_m + dst_off) = mom[1] + xm; // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; + dst_off = mad24(DST_ROW_01 * blocky, dst_step, dst_x_off); + if (dst_off < max_dst_index) + *(dst_m + dst_off) = mom[2] + ym; // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + 
mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); + dst_off = mad24(DST_ROW_20 * blocky, dst_step, dst_x_off); + if (dst_off < max_dst_index) + *(dst_m + dst_off) = mom[3] + x * (mom[1] * 2 + xm); // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; + dst_off = mad24(DST_ROW_11 * blocky, dst_step, dst_x_off); + if (dst_off < max_dst_index) + *(dst_m + dst_off) = mom[4] + x * (mom[2] + ym) + y * mom[1]; // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); + dst_off = mad24(DST_ROW_02 * blocky, dst_step, dst_x_off); + if (dst_off < max_dst_index) + *(dst_m + dst_off) = mom[5] + y * (mom[2] * 2 + ym); // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); + dst_off = mad24(DST_ROW_30 * blocky, dst_step, dst_x_off); + if (dst_off < max_dst_index) + *(dst_m + dst_off) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; + dst_off = mad24(DST_ROW_21 * blocky, dst_step, dst_x_off); + if (dst_off < max_dst_index) + *(dst_m + dst_off) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; + dst_off = mad24(DST_ROW_12 * blocky, dst_step, dst_x_off); + if (dst_off < max_dst_index) + *(dst_m + dst_off) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); + dst_off = mad24(DST_ROW_03 * blocky, dst_step, dst_x_off); + if (dst_off < max_dst_index) + *(dst_m + dst_off) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); } } diff --git a/modules/ocl/src/opencl/pyr_up.cl b/modules/ocl/src/opencl/pyr_up.cl index 0b7f0c902..ef41c9408 100644 --- a/modules/ocl/src/opencl/pyr_up.cl +++ b/modules/ocl/src/opencl/pyr_up.cl @@ -18,6 +18,7 @@ // Zhang Chunpeng chunpeng@multicorewareinc.com // Dachuan Zhao, dachuan@multicorewareinc.com // Yao Wang, yao@multicorewareinc.com +// Peng Xiao, pengxiao@outlook.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -47,7 +48,7 @@ //#pragma OPENCL EXTENSION cl_amd_printf : enable -uchar get_valid_uchar(uchar data) +uchar get_valid_uchar(float data) { return (uchar)(data <= 255 ? data : data > 0 ? 
255 : 0); } @@ -142,7 +143,7 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst, sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx]; if ((x < dstCols) && (y < dstRows)) - dst[x + y * dstStep] = (float)(4.0f * sum); + dst[x + y * dstStep] = convert_uchar_sat_rte(4.0f * sum); } @@ -244,7 +245,7 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst, sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)]; if ((x < dstCols) && (y < dstRows)) - dst[x + y * dstStep] = (float)(4.0f * sum); + dst[x + y * dstStep] = convert_short_sat_rte(4.0f * sum); } @@ -351,31 +352,6 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst, /////////////////////////////////////////////////////////////////////// ////////////////////////// CV_8UC4 ////////////////////////////////// /////////////////////////////////////////////////////////////////////// -float4 covert_uchar4_to_float4(uchar4 data) -{ - float4 f4Data = {0,0,0,0}; - - f4Data.x = (float)data.x; - f4Data.y = (float)data.y; - f4Data.z = (float)data.z; - f4Data.w = (float)data.w; - - return f4Data; -} - - -uchar4 convert_float4_to_uchar4(float4 data) -{ - uchar4 u4Data; - - u4Data.x = get_valid_uchar(data.x); - u4Data.y = get_valid_uchar(data.y); - u4Data.z = get_valid_uchar(data.z); - u4Data.w = get_valid_uchar(data.w); - - return u4Data; -} - __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst, int srcRows,int dstRows,int srcCols,int dstCols, int srcOffset,int dstOffset,int srcStep,int dstStep) @@ -406,7 +382,7 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst, srcy = abs(srcy); srcy = min(srcRows -1 ,srcy); - s_srcPatch[tidy][tidx] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]); + s_srcPatch[tidy][tidx] = convert_float4(src[srcx + srcy * srcStep]); } barrier(CLK_LOCAL_MEM_FENCE); @@ -476,38 +452,12 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst, if ((x < dstCols) && (y < dstRows)) { - dst[x + y * dstStep] = convert_float4_to_uchar4(4.0f * sum); + dst[x + y * dstStep] = convert_uchar4_sat_rte(4.0f * sum); } } /////////////////////////////////////////////////////////////////////// ////////////////////////// CV_16UC4 ////////////////////////////////// /////////////////////////////////////////////////////////////////////// -float4 covert_ushort4_to_float4(ushort4 data) -{ - float4 f4Data = {0,0,0,0}; - - f4Data.x = (float)data.x; - f4Data.y = (float)data.y; - f4Data.z = (float)data.z; - f4Data.w = (float)data.w; - - return f4Data; -} - - -ushort4 convert_float4_to_ushort4(float4 data) -{ - ushort4 u4Data; - - u4Data.x = (float)data.x; - u4Data.y = (float)data.y; - u4Data.z = (float)data.z; - u4Data.w = (float)data.w; - - return u4Data; -} - - __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, int srcRows,int dstRows,int srcCols,int dstCols, int srcOffset,int dstOffset,int srcStep,int dstStep) @@ -535,7 +485,7 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, srcy = abs(srcy); srcy = min(srcRows -1 ,srcy); - s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_ushort4_to_float4(src[srcx + srcy * srcStep]); + s_srcPatch[get_local_id(1)][get_local_id(0)] = convert_float4(src[srcx + srcy * srcStep]); } barrier(CLK_LOCAL_MEM_FENCE); @@ -570,11 +520,11 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, if (eveny) { - sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; + sum = sum + 
(evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; + sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; + sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; } s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; @@ -610,7 +560,7 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, if ((x < dstCols) && (y < dstRows)) { - dst[x + y * dstStep] = convert_float4_to_ushort4(4.0f * sum); + dst[x + y * dstStep] = convert_ushort4_sat_rte(4.0f * sum); } } @@ -681,11 +631,11 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst, if (eveny) { - sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)]; - sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)]; + sum = sum + (oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)]; sum = sum + (evenFlag * co1 ) * s_srcPatch[lsizey-16][1 + ((tidx ) >> 1)]; - sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)]; - sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)]; } s_dstPatch[tidy][tidx] = sum; diff --git a/modules/ocl/src/opencl/stereobm.cl b/modules/ocl/src/opencl/stereobm.cl index 196a786d5..bd86a7f3f 100644 --- a/modules/ocl/src/opencl/stereobm.cl +++ b/modules/ocl/src/opencl/stereobm.cl @@ -16,6 +16,8 @@ // // @Authors // Jia Haipeng, jiahaipeng95@gmail.com +// Sen Liu, swjtuls1987@126.com +// Peng Xiao, pengxiao@outlook.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -50,59 +52,40 @@ #define STEREO_MIND 0 // The minimum d range to check #define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing -int SQ(int a) -{ - return a * a; -} +#ifndef radius +#define radius 64 +#endif -unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache, - volatile __local unsigned int *col_ssd, int radius) +unsigned int CalcSSD(__local unsigned int *col_ssd) { - unsigned int cache = 0; - unsigned int cache2 = 0; + unsigned int cache = col_ssd[0]; - for(int i = 1; i <= radius; i++) +#pragma unroll + for(int i = 1; i <= (radius << 1); i++) cache += col_ssd[i]; - col_ssd_cache[0] = cache; - - barrier(CLK_LOCAL_MEM_FENCE); - - if (get_local_id(0) < BLOCK_W - radius) - cache2 = col_ssd_cache[radius]; - else - for(int i = radius + 1; i < (2 * radius + 1); i++) - cache2 += col_ssd[i]; - - return col_ssd[0] + cache + cache2; + return cache; } -uint2 MinSSD(volatile __local unsigned int *col_ssd_cache, - volatile __local unsigned int *col_ssd, int radius) +uint2 MinSSD(__local unsigned int *col_ssd) { unsigned int ssd[N_DISPARITIES]; + const int win_size = (radius << 1); - //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius) - ssd[0] = CalcSSD(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); - ssd[1] = CalcSSD(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); 
- ssd[2] = CalcSSD(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); - ssd[3] = CalcSSD(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); - ssd[4] = CalcSSD(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); - ssd[5] = CalcSSD(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); - ssd[6] = CalcSSD(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); - ssd[7] = CalcSSD(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); + //See above: #define COL_SSD_SIZE (BLOCK_W + WIN_SIZE) + ssd[0] = CalcSSD(col_ssd + 0 * (BLOCK_W + win_size)); + ssd[1] = CalcSSD(col_ssd + 1 * (BLOCK_W + win_size)); + ssd[2] = CalcSSD(col_ssd + 2 * (BLOCK_W + win_size)); + ssd[3] = CalcSSD(col_ssd + 3 * (BLOCK_W + win_size)); + ssd[4] = CalcSSD(col_ssd + 4 * (BLOCK_W + win_size)); + ssd[5] = CalcSSD(col_ssd + 5 * (BLOCK_W + win_size)); + ssd[6] = CalcSSD(col_ssd + 6 * (BLOCK_W + win_size)); + ssd[7] = CalcSSD(col_ssd + 7 * (BLOCK_W + win_size)); unsigned int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7]))); int bestIdx = 0; + for (int i = 0; i < N_DISPARITIES; i++) { if (mssd == ssd[i]) @@ -113,124 +96,66 @@ uint2 MinSSD(volatile __local unsigned int *col_ssd_cache, } void StepDown(int idx1, int idx2, __global unsigned char* imageL, - __global unsigned char* imageR, int d, volatile __local unsigned int *col_ssd, int radius) + __global unsigned char* imageR, int d, __local unsigned int *col_ssd) { - unsigned char leftPixel1; - unsigned char leftPixel2; - unsigned char rightPixel1[8]; - unsigned char rightPixel2[8]; - unsigned int diff1, diff2; - - leftPixel1 = imageL[idx1]; - leftPixel2 = imageL[idx2]; - - idx1 = idx1 - d; - idx2 = idx2 - d; - - rightPixel1[7] = imageR[idx1 - 7]; - rightPixel1[0] = imageR[idx1 - 0]; - rightPixel1[1] = imageR[idx1 - 1]; - rightPixel1[2] = imageR[idx1 - 2]; - rightPixel1[3] = imageR[idx1 - 3]; - rightPixel1[4] = imageR[idx1 - 4]; - rightPixel1[5] = imageR[idx1 - 5]; - rightPixel1[6] = imageR[idx1 - 6]; - - rightPixel2[7] = imageR[idx2 - 7]; - rightPixel2[0] = imageR[idx2 - 0]; - rightPixel2[1] = imageR[idx2 - 1]; - rightPixel2[2] = imageR[idx2 - 2]; - rightPixel2[3] = imageR[idx2 - 3]; - rightPixel2[4] = imageR[idx2 - 4]; - rightPixel2[5] = imageR[idx2 - 5]; - rightPixel2[6] = imageR[idx2 - 6]; - - //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius) - diff1 = leftPixel1 - rightPixel1[0]; - diff2 = leftPixel2 - rightPixel2[0]; - col_ssd[0 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); - - diff1 = leftPixel1 - rightPixel1[1]; - diff2 = leftPixel2 - rightPixel2[1]; - col_ssd[1 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); - - diff1 = leftPixel1 - rightPixel1[2]; - diff2 = leftPixel2 - rightPixel2[2]; - col_ssd[2 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); - - diff1 = leftPixel1 - rightPixel1[3]; - diff2 = leftPixel2 - rightPixel2[3]; - col_ssd[3 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); - - diff1 = leftPixel1 - rightPixel1[4]; - diff2 = leftPixel2 - rightPixel2[4]; - col_ssd[4 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); - - diff1 = leftPixel1 - rightPixel1[5]; - diff2 = leftPixel2 - rightPixel2[5]; - col_ssd[5 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); - - diff1 = leftPixel1 - rightPixel1[6]; - diff2 = leftPixel2 - rightPixel2[6]; - 
col_ssd[6 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); - - diff1 = leftPixel1 - rightPixel1[7]; - diff2 = leftPixel2 - rightPixel2[7]; - col_ssd[7 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); + uint8 imgR1 = convert_uint8(vload8(0, imageR + (idx1 - d - 7))); + uint8 imgR2 = convert_uint8(vload8(0, imageR + (idx2 - d - 7))); + uint8 diff1 = (uint8)(imageL[idx1]) - imgR1; + uint8 diff2 = (uint8)(imageL[idx2]) - imgR2; + uint8 res = diff2 * diff2 - diff1 * diff1; + const int win_size = (radius << 1); + col_ssd[0 * (BLOCK_W + win_size)] += res.s7; + col_ssd[1 * (BLOCK_W + win_size)] += res.s6; + col_ssd[2 * (BLOCK_W + win_size)] += res.s5; + col_ssd[3 * (BLOCK_W + win_size)] += res.s4; + col_ssd[4 * (BLOCK_W + win_size)] += res.s3; + col_ssd[5 * (BLOCK_W + win_size)] += res.s2; + col_ssd[6 * (BLOCK_W + win_size)] += res.s1; + col_ssd[7 * (BLOCK_W + win_size)] += res.s0; } void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL, __global unsigned char* imageR, int d, - volatile __local unsigned int *col_ssd, int radius) + __local unsigned int *col_ssd) { - unsigned char leftPixel1; - int idx; - unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0}; - - for(int i = 0; i < (2 * radius + 1); i++) + uint8 leftPixel1; + uint8 diffa = 0; + int idx = y_tex * im_pitch + x_tex; + const int win_size = (radius << 1); + for(int i = 0; i < (win_size + 1); i++) { - idx = y_tex * im_pitch + x_tex; - leftPixel1 = imageL[idx]; - idx = idx - d; + leftPixel1 = (uint8)(imageL[idx]); + uint8 imgR = convert_uint8(vload8(0, imageR + (idx - d - 7))); + uint8 res = leftPixel1 - imgR; + diffa += res * res; - diffa[0] += SQ(leftPixel1 - imageR[idx - 0]); - diffa[1] += SQ(leftPixel1 - imageR[idx - 1]); - diffa[2] += SQ(leftPixel1 - imageR[idx - 2]); - diffa[3] += SQ(leftPixel1 - imageR[idx - 3]); - diffa[4] += SQ(leftPixel1 - imageR[idx - 4]); - diffa[5] += SQ(leftPixel1 - imageR[idx - 5]); - diffa[6] += SQ(leftPixel1 - imageR[idx - 6]); - diffa[7] += SQ(leftPixel1 - imageR[idx - 7]); - - y_tex += 1; + idx += im_pitch; } - //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius) - col_ssd[0 * (BLOCK_W + 2 * radius)] = diffa[0]; - col_ssd[1 * (BLOCK_W + 2 * radius)] = diffa[1]; - col_ssd[2 * (BLOCK_W + 2 * radius)] = diffa[2]; - col_ssd[3 * (BLOCK_W + 2 * radius)] = diffa[3]; - col_ssd[4 * (BLOCK_W + 2 * radius)] = diffa[4]; - col_ssd[5 * (BLOCK_W + 2 * radius)] = diffa[5]; - col_ssd[6 * (BLOCK_W + 2 * radius)] = diffa[6]; - col_ssd[7 * (BLOCK_W + 2 * radius)] = diffa[7]; + //See above: #define COL_SSD_SIZE (BLOCK_W + WIN_SIZE) + col_ssd[0 * (BLOCK_W + win_size)] = diffa.s7; + col_ssd[1 * (BLOCK_W + win_size)] = diffa.s6; + col_ssd[2 * (BLOCK_W + win_size)] = diffa.s5; + col_ssd[3 * (BLOCK_W + win_size)] = diffa.s4; + col_ssd[4 * (BLOCK_W + win_size)] = diffa.s3; + col_ssd[5 * (BLOCK_W + win_size)] = diffa.s2; + col_ssd[6 * (BLOCK_W + win_size)] = diffa.s1; + col_ssd[7 * (BLOCK_W + win_size)] = diffa.s0; } __kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right, __global unsigned int *cminSSDImage, int cminSSD_step, __global unsigned char *disp, int disp_step,int cwidth, int cheight, - int img_step, int maxdisp, int radius, + int img_step, int maxdisp, __local unsigned int *col_ssd_cache) { - - volatile __local unsigned int *col_ssd = col_ssd_cache + BLOCK_W + get_local_id(0); - volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? 
col_ssd + BLOCK_W : 0; + __local unsigned int *col_ssd = col_ssd_cache + get_local_id(0); + __local unsigned int *col_ssd_extra = get_local_id(0) < (radius << 1) ? col_ssd + BLOCK_W : 0; int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius; - // int Y = get_group_id(1) * ROWSperTHREAD + radius; #define Y (get_group_id(1) * ROWSperTHREAD + radius) - volatile __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; + __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; __global unsigned char* disparImage = disp + X + Y * disp_step; int end_row = ROWSperTHREAD < (cheight - Y) ? ROWSperTHREAD:(cheight - Y); @@ -244,14 +169,14 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char { y_tex = Y - radius; - InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd, radius); + InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd); if (col_ssd_extra > 0) if (x_tex + BLOCK_W < cwidth) - InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra, radius); + InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra); barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function - uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); + uint2 minSSD = MinSSD(col_ssd); if (X < cwidth - radius && Y < cheight - radius) { if (minSSD.x < minSSDImage[0]) @@ -264,21 +189,18 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char for(int row = 1; row < end_row; row++) { int idx1 = y_tex * img_step + x_tex; - int idx2 = min(y_tex + (2 * radius + 1), cheight - 1) * img_step + x_tex; - - barrier(CLK_GLOBAL_MEM_FENCE); + int idx2 = min(y_tex + ((radius << 1) + 1), cheight - 1) * img_step + x_tex; + barrier(CLK_LOCAL_MEM_FENCE); - StepDown(idx1, idx2, left, right, d, col_ssd, radius); + StepDown(idx1, idx2, left, right, d, col_ssd); if (col_ssd_extra > 0) if (x_tex + BLOCK_W < cwidth) - StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra, radius); - - y_tex += 1; + StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra); barrier(CLK_LOCAL_MEM_FENCE); - uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); + uint2 minSSD = MinSSD(col_ssd); if (X < cwidth - radius && row < cheight - radius - Y) { int idx = row * cminSSD_step; @@ -288,10 +210,11 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char minSSDImage[idx] = minSSD.x; } } + + y_tex++; } // for row loop } // for d loop } - ////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////// Sobel Prefiler (signal channel)////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/modules/ocl/src/opencl/stereobp.cl b/modules/ocl/src/opencl/stereobp.cl index 3993acae0..03e42876e 100644 --- a/modules/ocl/src/opencl/stereobp.cl +++ b/modules/ocl/src/opencl/stereobp.cl @@ -185,10 +185,10 @@ __kernel void data_step_down(__global T *src, int src_rows, for (int d = 0; d < cndisp; ++d) { float dst_reg; - dst_reg = src[(d * src_rows + (2*y+0)) * src_step + 2*x+0]; - dst_reg += src[(d * src_rows + (2*y+1)) * src_step + 2*x+0]; - dst_reg += src[(d * src_rows + (2*y+0)) * src_step + 2*x+1]; - dst_reg += src[(d * src_rows + (2*y+1)) * src_step + 2*x+1]; + dst_reg = src[(d * src_rows + min(2*y+0, src_rows-1)) * src_step + 2*x+0]; + dst_reg += src[(d * src_rows + min(2*y+1, src_rows-1)) * 
src_step + 2*x+0]; + dst_reg += src[(d * src_rows + min(2*y+0, src_rows-1)) * src_step + 2*x+1]; + dst_reg += src[(d * src_rows + min(2*y+1, src_rows-1)) * src_step + 2*x+1]; dst[(d * dst_rows + y) * dst_step + x] = saturate_cast(dst_reg); } diff --git a/modules/ocl/src/opencl/stereocsbp.cl b/modules/ocl/src/opencl/stereocsbp.cl new file mode 100644 index 000000000..ea7af62b2 --- /dev/null +++ b/modules/ocl/src/opencl/stereocsbp.cl @@ -0,0 +1,1402 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Jia Haipeng, jiahaipeng95@gmail.com +// Jin Ma, jin@multicorewareinc.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + + +#ifndef FLT_MAX +#define FLT_MAX CL_FLT_MAX +#endif + +#ifndef SHRT_MAX +#define SHRT_MAX CL_SHORT_MAX +#endif + + +/////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////get_first_k_initial_global////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void get_first_k_initial_global_0(__global short *data_cost_selected_, __global short *selected_disp_pyr, + __global short *ctemp, int h, int w, int nr_plane, + int cmsg_step1, int cdisp_step1, int cndisp) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + __global short *selected_disparity = selected_disp_pyr + y * cmsg_step1 + x; + __global short *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x; + __global short *data_cost = ctemp + y * cmsg_step1 + x; + + for(int i = 0; i < nr_plane; i++) + { + short minimum = SHRT_MAX; + int id = 0; + + for(int d = 0; d < cndisp; d++) + { + short cur = data_cost[d * cdisp_step1]; + if(cur < minimum) + { + minimum = cur; + id = d; + } + } + + data_cost_selected[i * cdisp_step1] = minimum; + selected_disparity[i * cdisp_step1] = id; + data_cost [id * cdisp_step1] = SHRT_MAX; + } + } +} +__kernel void get_first_k_initial_global_1(__global float *data_cost_selected_, __global float *selected_disp_pyr, + __global float *ctemp, int h, int w, int nr_plane, + int cmsg_step1, int cdisp_step1, int cndisp) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + __global float *selected_disparity = selected_disp_pyr + y * cmsg_step1 + x; + __global float *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x; + __global float *data_cost = ctemp + y * cmsg_step1 + x; + + for(int i = 0; i < nr_plane; i++) + { + float minimum = FLT_MAX; + int id = 0; + + for(int d = 0; d < cndisp; d++) + { + float cur = data_cost[d * cdisp_step1]; + if(cur < minimum) + { + minimum = cur; + id = d; + } + } + + data_cost_selected[i * cdisp_step1] = minimum; + selected_disparity[i * cdisp_step1] = id; + data_cost [id * cdisp_step1] = FLT_MAX; + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////get_first_k_initial_local//////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void get_first_k_initial_local_0(__global short *data_cost_selected_, __global short *selected_disp_pyr, + __global short *ctemp,int h, int w, int nr_plane, + int cmsg_step1, int cdisp_step1, int cndisp) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + __global short *selected_disparity = selected_disp_pyr + y * cmsg_step1 + x; + __global short *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x; + __global short *data_cost = ctemp + y * cmsg_step1 + x; + + int nr_local_minimum = 0; + + short prev = data_cost[0 * cdisp_step1]; + short cur = data_cost[1 * cdisp_step1]; + short next = data_cost[2 * cdisp_step1]; + + for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++) + { + + if (cur < prev && cur < next) + { + data_cost_selected[nr_local_minimum * cdisp_step1] = cur; + selected_disparity[nr_local_minimum * cdisp_step1] = d; + data_cost[d * cdisp_step1] = SHRT_MAX; + + nr_local_minimum++; + } + + prev = cur; + cur = next; + next = 
data_cost[(d + 1) * cdisp_step1]; + } + + for (int i = nr_local_minimum; i < nr_plane; i++) + { + short minimum = SHRT_MAX; + int id = 0; + + for (int d = 0; d < cndisp; d++) + { + cur = data_cost[d * cdisp_step1]; + if (cur < minimum) + { + minimum = cur; + id = d; + } + } + + data_cost_selected[i * cdisp_step1] = minimum; + selected_disparity[i * cdisp_step1] = id; + data_cost[id * cdisp_step1] = SHRT_MAX; + } + } +} + +__kernel void get_first_k_initial_local_1(__global float *data_cost_selected_, __global float *selected_disp_pyr, + __global float *ctemp,int h, int w, int nr_plane, + int cmsg_step1, int cdisp_step1, int cndisp) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + __global float *selected_disparity = selected_disp_pyr + y * cmsg_step1 + x; + __global float *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x; + __global float *data_cost = ctemp + y * cmsg_step1 + x; + + int nr_local_minimum = 0; + + float prev = data_cost[0 * cdisp_step1]; + float cur = data_cost[1 * cdisp_step1]; + float next = data_cost[2 * cdisp_step1]; + + for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++) + { + if (cur < prev && cur < next) + { + data_cost_selected[nr_local_minimum * cdisp_step1] = cur; + selected_disparity[nr_local_minimum * cdisp_step1] = d; + data_cost[d * cdisp_step1] = FLT_MAX ; + + nr_local_minimum++; + } + + prev = cur; + cur = next; + next = data_cost[(d + 1) * cdisp_step1]; + } + + + for (int i = nr_local_minimum; i < nr_plane; i++) + { + float minimum = FLT_MAX; + int id = 0; + + for (int d = 0; d < cndisp; d++) + { + cur = data_cost[d * cdisp_step1]; + if (cur < minimum) + { + minimum = cur; + id = d; + } + } + + data_cost_selected[i * cdisp_step1] = minimum; + selected_disparity[i * cdisp_step1] = id; + data_cost[id * cdisp_step1] = FLT_MAX; + } + } +} + +/////////////////////////////////////////////////////////////// +/////////////////////// init data cost //////////////////////// +/////////////////////////////////////////////////////////////// +float compute_3(__global uchar* left, __global uchar* right, + float cdata_weight, float cmax_data_term) +{ + float tb = 0.114f * abs((int)left[0] - right[0]); + float tg = 0.587f * abs((int)left[1] - right[1]); + float tr = 0.299f * abs((int)left[2] - right[2]); + + return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term); +} +float compute_1(__global uchar* left, __global uchar* right, + float cdata_weight, float cmax_data_term) +{ + return fmin(cdata_weight * abs((int)*left - (int)*right), cdata_weight * cmax_data_term); +} +short round_short(float v){ + return convert_short_sat_rte(v); +} +/////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////init_data_cost/////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void init_data_cost_0(__global short *ctemp, __global uchar *cleft, __global uchar *cright, + int h, int w, int level, int channels, + int cmsg_step1, float cdata_weight, float cmax_data_term, int cdisp_step1, + int cth, int cimg_step, int cndisp) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + int y0 = y << level; + int yt = (y + 1) << level; + + int x0 = x << level; + int xt = (x + 1) << level; + + __global short *data_cost = ctemp + y * cmsg_step1 + x; + + for(int d = 0; d < cndisp; ++d) + { + float val = 0.0f; + for(int 
yi = y0; yi < yt; yi++) + { + for(int xi = x0; xi < xt; xi++) + { + int xr = xi - d; + if(d < cth || xr < 0) + val += cdata_weight * cmax_data_term; + else + { + __global uchar *lle = cleft + yi * cimg_step + xi * channels; + __global uchar *lri = cright + yi * cimg_step + xr * channels; + + if(channels == 1) + val += compute_1(lle, lri, cdata_weight, cmax_data_term); + else + val += compute_3(lle, lri, cdata_weight, cmax_data_term); + } + } + } + data_cost[cdisp_step1 * d] = round_short(val); + } + } +} +__kernel void init_data_cost_1(__global float *ctemp, __global uchar *cleft, __global uchar *cright, + int h, int w, int level, int channels, + int cmsg_step1, float cdata_weight, float cmax_data_term, int cdisp_step1, + int cth, int cimg_step, int cndisp) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + int y0 = y << level; + int yt = (y + 1) << level; + + int x0 = x << level; + int xt = (x + 1) << level; + + __global float *data_cost = ctemp + y * cmsg_step1 + x; + + for(int d = 0; d < cndisp; ++d) + { + float val = 0.0f; + for(int yi = y0; yi < yt; yi++) + { + for(int xi = x0; xi < xt; xi++) + { + int xr = xi - d; + if(d < cth || xr < 0) + val += cdata_weight * cmax_data_term; + else + { + __global uchar* lle = cleft + yi * cimg_step + xi * channels; + __global uchar* lri = cright + yi * cimg_step + xr * channels; + + if(channels == 1) + val += compute_1(lle, lri, cdata_weight, cmax_data_term); + else + val += compute_3(lle, lri, cdata_weight, cmax_data_term); + } + } + } + data_cost[cdisp_step1 * d] = val; + } + } +} +//////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////init_data_cost_reduce////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void init_data_cost_reduce_0(__global short *ctemp, __global uchar *cleft, __global uchar *cright, + __local float *smem, int level, int rows, int cols, int h, int winsz, int channels, + int cndisp,int cimg_step, float cdata_weight, float cmax_data_term, int cth, + int cdisp_step1, int cmsg_step1) +{ + int x_out = get_group_id(0); + int y_out = get_group_id(1) % h; + //int d = (blockIdx.y / h) * blockDim.z + threadIdx.z; + int d = (get_group_id(1) / h ) * get_local_size(2) + get_local_id(2); + + int tid = get_local_id(0); + + if (d < cndisp) + { + int x0 = x_out << level; + int y0 = y_out << level; + + int len = min(y0 + winsz, rows) - y0; + + float val = 0.0f; + if (x0 + tid < cols) + { + if (x0 + tid - d < 0 || d < cth) + val = cdata_weight * cmax_data_term * len; + else + { + __global uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid ); + __global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d); + + for(int y = 0; y < len; ++y) + { + if(channels == 1) + val += compute_1(lle, lri, cdata_weight, cmax_data_term); + else + val += compute_3(lle, lri, cdata_weight, cmax_data_term); + + lle += cimg_step; + lri += cimg_step; + } + } + } + + __local float* dline = smem + winsz * get_local_id(2); + + dline[tid] = val; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local float* dline = smem + winsz * get_local_id(2); + if (winsz >= 256) + { + if (tid < 128) + dline[tid] += dline[tid + 128]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local float* dline = smem + winsz * get_local_id(2); + if (winsz >= 128) + { + if (tid < 64) + dline[tid] += dline[tid 
+ 64]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 64) + if (tid < 32) + vdline[tid] += vdline[tid + 32]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 32) + if (tid < 16) + vdline[tid] += vdline[tid + 16]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 16) + if (tid < 8) + vdline[tid] += vdline[tid + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 8) + if (tid < 4) + vdline[tid] += vdline[tid + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 4) + if (tid < 2) + vdline[tid] += vdline[tid + 2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 2) + if (tid < 1) + vdline[tid] += vdline[tid + 1]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local float* dline = smem + winsz * get_local_id(2); + __global short* data_cost = ctemp + y_out * cmsg_step1 + x_out; + if (tid == 0) + data_cost[cdisp_step1 * d] = convert_short_sat_rte(dline[0]); + } +} + +__kernel void init_data_cost_reduce_1(__global float *ctemp, __global uchar *cleft, __global uchar *cright, + __local float *smem, int level, int rows, int cols, int h, int winsz, int channels, + int cndisp,int cimg_step, float cdata_weight, float cmax_data_term, int cth, + int cdisp_step1, int cmsg_step1) +{ + int x_out = get_group_id(0); + int y_out = get_group_id(1) % h; + int d = (get_group_id(1) / h ) * get_local_size(2) + get_local_id(2); + + int tid = get_local_id(0); + + if (d < cndisp) + { + int x0 = x_out << level; + int y0 = y_out << level; + + int len = min(y0 + winsz, rows) - y0; + + float val = 0.0f; + //float val = 528.0f; + + if (x0 + tid < cols) + { + if (x0 + tid - d < 0 || d < cth) + val = cdata_weight * cmax_data_term * len; + else + { + __global uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid ); + __global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d); + + for(int y = 0; y < len; ++y) + { + if(channels == 1) + val += compute_1(lle, lri, cdata_weight, cmax_data_term); + else + val += compute_3(lle, lri, cdata_weight, cmax_data_term); + + lle += cimg_step; + lri += cimg_step; + } + } + } + + __local float* dline = smem + winsz * get_local_id(2); + + dline[tid] = val; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local float* dline = smem + winsz * get_local_id(2); + if (winsz >= 256) + if (tid < 128) + dline[tid] += dline[tid + 128]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local float* dline = smem + winsz * get_local_id(2); + if (winsz >= 128) + if (tid < 64) + dline[tid] += dline[tid + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 64) + if (tid < 32) + vdline[tid] += vdline[tid + 32]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 32) + if (tid < 16) + vdline[tid] += vdline[tid + 16]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 16) + if (tid < 8) + vdline[tid] += vdline[tid + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 8) + if (tid < 4) + vdline[tid] += vdline[tid + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local volatile float* vdline = smem +
winsz * get_local_id(2); + if (winsz >= 4) + if (tid < 2) + vdline[tid] += vdline[tid + 2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 2) + if (tid < 1) + vdline[tid] += vdline[tid + 1]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < cndisp) + { + __global float *data_cost = ctemp + y_out * cmsg_step1 + x_out; + __local float* dline = smem + winsz * get_local_id(2); + if (tid == 0) + data_cost[cdisp_step1 * d] = dline[0]; + } +} + +/////////////////////////////////////////////////////////////// +////////////////////// compute data cost ////////////////////// +/////////////////////////////////////////////////////////////// +__kernel void compute_data_cost_0(__global const short *selected_disp_pyr, __global short *data_cost_, + __global uchar *cleft, __global uchar *cright, + int h, int w, int level, int nr_plane, int channels, + int cmsg_step1, int cmsg_step2, int cdisp_step1, int cdisp_step2, float cdata_weight, + float cmax_data_term, int cimg_step, int cth) +{ + + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + int y0 = y << level; + int yt = (y + 1) << level; + + int x0 = x << level; + int xt = (x + 1) << level; + + __global const short *selected_disparity = selected_disp_pyr + y/2 * cmsg_step2 + x/2; + __global short *data_cost = data_cost_ + y * cmsg_step1 + x; + + for(int d = 0; d < nr_plane; d++) + { + float val = 0.0f; + for(int yi = y0; yi < yt; yi++) + { + for(int xi = x0; xi < xt; xi++) + { + int sel_disp = selected_disparity[d * cdisp_step2]; + int xr = xi - sel_disp; + + if (xr < 0 || sel_disp < cth) + val += cdata_weight * cmax_data_term; + + else + { + __global uchar* left_x = cleft + yi * cimg_step + xi * channels; + __global uchar* right_x = cright + yi * cimg_step + xr * channels; + + if(channels == 1) + val += compute_1(left_x, right_x, cdata_weight, cmax_data_term); + else + val += compute_3(left_x, right_x, cdata_weight, cmax_data_term); + } + } + } + data_cost[cdisp_step1 * d] = convert_short_sat_rte(val); + } + } +} +__kernel void compute_data_cost_1(__global const float *selected_disp_pyr, __global float *data_cost_, + __global uchar *cleft, __global uchar *cright, + int h, int w, int level, int nr_plane, int channels, + int cmsg_step1, int cmsg_step2, int cdisp_step1, int cdisp_step2, float cdata_weight, + float cmax_data_term, int cimg_step, int cth) +{ + + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + int y0 = y << level; + int yt = (y + 1) << level; + + int x0 = x << level; + int xt = (x + 1) << level; + + __global const float *selected_disparity = selected_disp_pyr + y/2 * cmsg_step2 + x/2; + __global float *data_cost = data_cost_ + y * cmsg_step1 + x; + + for(int d = 0; d < nr_plane; d++) + { + float val = 0.0f; + for(int yi = y0; yi < yt; yi++) + { + for(int xi = x0; xi < xt; xi++) + { + int sel_disp = selected_disparity[d * cdisp_step2]; + int xr = xi - sel_disp; + + if (xr < 0 || sel_disp < cth) + val += cdata_weight * cmax_data_term; + else + { + __global uchar* left_x = cleft + yi * cimg_step + xi * channels; + __global uchar* right_x = cright + yi * cimg_step + xr * channels; + + if(channels == 1) + val += compute_1(left_x, right_x, cdata_weight, cmax_data_term); + else + val += compute_3(left_x, right_x, cdata_weight, cmax_data_term); + } + } + } + data_cost[cdisp_step1 * d] = val; + } + } +} 
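+// Note on the *_reduce kernels that follow: each work-group handles one low-resolution output
+// pixel (x_out, y_out) and a batch of selected disparity planes along the local z dimension.
+// Every thread accumulates the matching cost over one column of the winsz-wide window, stores
+// its partial sum in local memory, and the partial sums are then folded in halves
+// (tid + 32, + 16, ..., + 1) until thread 0 writes the reduced cost for its plane.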
+//////////////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////compute_data_cost_reduce////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////////////// +__kernel void compute_data_cost_reduce_0(__global const short* selected_disp_pyr, __global short* data_cost_, + __global uchar *cleft, __global uchar *cright,__local float *smem, + int level, int rows, int cols, int h, int nr_plane, + int channels, int winsz, + int cmsg_step1, int cmsg_step2, int cdisp_step1, int cdisp_step2, + float cdata_weight, float cmax_data_term, int cimg_step,int cth) + +{ + int x_out = get_group_id(0); + int y_out = get_group_id(1) % h; + int d = (get_group_id(1)/ h) * get_local_size(2) + get_local_id(2); + + int tid = get_local_id(0); + + __global const short* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step2 + x_out/2; + __global short* data_cost = data_cost_ + y_out * cmsg_step1 + x_out; + + if (d < nr_plane) + { + int sel_disp = selected_disparity[d * cdisp_step2]; + + int x0 = x_out << level; + int y0 = y_out << level; + + int len = min(y0 + winsz, rows) - y0; + + float val = 0.0f; + if (x0 + tid < cols) + { + if (x0 + tid - sel_disp < 0 || sel_disp < cth) + val = cdata_weight * cmax_data_term * len; + else + { + __global uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid ); + __global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp); + + for(int y = 0; y < len; ++y) + { + if(channels == 1) + val += compute_1(lle, lri, cdata_weight, cmax_data_term); + else + val += compute_3(lle, lri, cdata_weight, cmax_data_term); + + lle += cimg_step; + lri += cimg_step; + } + } + } + + __local float* dline = smem + winsz * get_local_id(2); + + dline[tid] = val; + } + + barrier(CLK_LOCAL_MEM_FENCE); + // if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); } + //if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); } + if(d < nr_plane) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 64) + { + if (tid < 32) + vdline[tid] += vdline[tid + 32]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < nr_plane) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 32) + { + if (tid < 16) + vdline[tid] += vdline[tid + 16]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < nr_plane) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 16) + { + if (tid < 8) + vdline[tid] += vdline[tid + 8]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < nr_plane) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 8) + { + if (tid < 4) + vdline[tid] += vdline[tid + 4]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < nr_plane) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 4) + { + if (tid < 2) + vdline[tid] += vdline[tid + 2]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < nr_plane) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 2) + { + if (tid < 1) + vdline[tid] += vdline[tid + 1]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < nr_plane) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (tid == 0) + data_cost[cdisp_step1 * d] = convert_short_sat_rte(vdline[0]); + } +} + +__kernel void 
compute_data_cost_reduce_1(__global const float *selected_disp_pyr, __global float *data_cost_, + __global uchar *cleft, __global uchar *cright, __local float *smem, + int level, int rows, int cols, int h, int nr_plane, + int channels, int winsz, + int cmsg_step1, int cmsg_step2, int cdisp_step1,int cdisp_step2, float cdata_weight, + float cmax_data_term, int cimg_step, int cth) + +{ + int x_out = get_group_id(0); + int y_out = get_group_id(1) % h; + int d = (get_group_id(1)/ h) * get_local_size(2) + get_local_id(2); + + int tid = get_local_id(0); + + __global const float *selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step2 + x_out/2; + __global float *data_cost = data_cost_ + y_out * cmsg_step1 + x_out; + + if (d < nr_plane) + { + int sel_disp = selected_disparity[d * cdisp_step2]; + + int x0 = x_out << level; + int y0 = y_out << level; + + int len = min(y0 + winsz, rows) - y0; + + float val = 0.0f; + if (x0 + tid < cols) + { + if (x0 + tid - sel_disp < 0 || sel_disp < cth) + val = cdata_weight * cmax_data_term * len; + else + { + __global uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid ); + __global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp); + + for(int y = 0; y < len; ++y) + { + if(channels == 1) + val += compute_1(lle, lri, cdata_weight, cmax_data_term); + else + val += compute_3(lle, lri, cdata_weight, cmax_data_term); + + lle += cimg_step; + lri += cimg_step; + } + } + } + + __local float* dline = smem + winsz * get_local_id(2); + + dline[tid] = val; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < nr_plane) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 64) + { + if (tid < 32) + vdline[tid] += vdline[tid + 32]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + + if(d < nr_plane) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 32) + { + if (tid < 16) + vdline[tid] += vdline[tid + 16]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < nr_plane) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 16) + { + if (tid < 8) + vdline[tid] += vdline[tid + 8]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < nr_plane) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 8) + { + if (tid < 4) + vdline[tid] += vdline[tid + 4]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < nr_plane) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 4) + { + if (tid < 2) + vdline[tid] += vdline[tid + 2]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < nr_plane) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (winsz >= 2) + { + if (tid < 1) + vdline[tid] += vdline[tid + 1]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(d < nr_plane) + { + __local volatile float* vdline = smem + winsz * get_local_id(2); + if (tid == 0) + data_cost[cdisp_step1 * d] = vdline[0]; + } +} + +/////////////////////////////////////////////////////////////// +//////////////////////// init message ///////////////////////// +/////////////////////////////////////////////////////////////// +void get_first_k_element_increase_0(__global short* u_new, __global short *d_new, __global short *l_new, + __global short *r_new, __global const short *u_cur, __global const short *d_cur, + __global const short *l_cur, __global const short *r_cur, + __global short *data_cost_selected, __global short *disparity_selected_new, + __global short *data_cost_new, __global const short* data_cost_cur, + 
__global const short *disparity_selected_cur, + int nr_plane, int nr_plane2, + int cdisp_step1, int cdisp_step2) +{ + for(int i = 0; i < nr_plane; i++) + { + short minimum = SHRT_MAX; + int id = 0; + for(int j = 0; j < nr_plane2; j++) + { + short cur = data_cost_new[j * cdisp_step1]; + if(cur < minimum) + { + minimum = cur; + id = j; + } + } + + data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1]; + disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2]; + + u_new[i * cdisp_step1] = u_cur[id * cdisp_step2]; + d_new[i * cdisp_step1] = d_cur[id * cdisp_step2]; + l_new[i * cdisp_step1] = l_cur[id * cdisp_step2]; + r_new[i * cdisp_step1] = r_cur[id * cdisp_step2]; + + data_cost_new[id * cdisp_step1] = SHRT_MAX; + } +} +void get_first_k_element_increase_1(__global float *u_new, __global float *d_new, __global float *l_new, + __global float *r_new, __global const float *u_cur, __global const float *d_cur, + __global const float *l_cur, __global const float *r_cur, + __global float *data_cost_selected, __global float *disparity_selected_new, + __global float *data_cost_new, __global const float *data_cost_cur, + __global const float *disparity_selected_cur, + int nr_plane, int nr_plane2, + int cdisp_step1, int cdisp_step2) +{ + for(int i = 0; i < nr_plane; i++) + { + float minimum = FLT_MAX; + int id = 0; + + for(int j = 0; j < nr_plane2; j++) + { + float cur = data_cost_new[j * cdisp_step1]; + if(cur < minimum) + { + minimum = cur; + id = j; + } + } + + data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1]; + disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2]; + + u_new[i * cdisp_step1] = u_cur[id * cdisp_step2]; + d_new[i * cdisp_step1] = d_cur[id * cdisp_step2]; + l_new[i * cdisp_step1] = l_cur[id * cdisp_step2]; + r_new[i * cdisp_step1] = r_cur[id * cdisp_step2]; + data_cost_new[id * cdisp_step1] = FLT_MAX; + + } +} +__kernel void init_message_0(__global short *u_new_, __global short *d_new_, __global short *l_new_, + __global short *r_new_, __global short *u_cur_, __global const short *d_cur_, + __global const short *l_cur_, __global const short *r_cur_, __global short *ctemp, + __global short *selected_disp_pyr_new, __global const short *selected_disp_pyr_cur, + __global short *data_cost_selected_, __global const short *data_cost_, + int h, int w, int nr_plane, int h2, int w2, int nr_plane2, + int cdisp_step1, int cdisp_step2, int cmsg_step1, int cmsg_step2) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < h && x < w) + { + __global const short *u_cur = u_cur_ + min(h2-1, y/2 + 1) * cmsg_step2 + x/2; + __global const short *d_cur = d_cur_ + max(0, y/2 - 1) * cmsg_step2 + x/2; + __global const short *l_cur = l_cur_ + y/2 * cmsg_step2 + min(w2-1, x/2 + 1); + __global const short *r_cur = r_cur_ + y/2 * cmsg_step2 + max(0, x/2 - 1); + + __global short *data_cost_new = ctemp + y * cmsg_step1 + x; + + __global const short *disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step2 + x/2; + __global const short *data_cost = data_cost_ + y * cmsg_step1 + x; + + for(int d = 0; d < nr_plane2; d++) + { + int idx2 = d * cdisp_step2; + + short val = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2]; + data_cost_new[d * cdisp_step1] = val; + } + + __global short *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x; + __global short *disparity_selected_new = selected_disp_pyr_new + y * cmsg_step1 + x; + + __global short *u_new = u_new_ + y * 
cmsg_step1 + x; + __global short *d_new = d_new_ + y * cmsg_step1 + x; + __global short *l_new = l_new_ + y * cmsg_step1 + x; + __global short *r_new = r_new_ + y * cmsg_step1 + x; + + u_cur = u_cur_ + y/2 * cmsg_step2 + x/2; + d_cur = d_cur_ + y/2 * cmsg_step2 + x/2; + l_cur = l_cur_ + y/2 * cmsg_step2 + x/2; + r_cur = r_cur_ + y/2 * cmsg_step2 + x/2; + + get_first_k_element_increase_0(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur, + data_cost_selected, disparity_selected_new, data_cost_new, + data_cost, disparity_selected_cur, nr_plane, nr_plane2, + cdisp_step1, cdisp_step2); + } +} +__kernel void init_message_1(__global float *u_new_, __global float *d_new_, __global float *l_new_, + __global float *r_new_, __global const float *u_cur_, __global const float *d_cur_, + __global const float *l_cur_, __global const float *r_cur_, __global float *ctemp, + __global float *selected_disp_pyr_new, __global const float *selected_disp_pyr_cur, + __global float *data_cost_selected_, __global const float *data_cost_, + int h, int w, int nr_plane, int h2, int w2, int nr_plane2, + int cdisp_step1, int cdisp_step2, int cmsg_step1, int cmsg_step2) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + + __global const float *u_cur = u_cur_ + min(h2-1, y/2 + 1) * cmsg_step2 + x/2; + __global const float *d_cur = d_cur_ + max(0, y/2 - 1) * cmsg_step2 + x/2; + __global const float *l_cur = l_cur_ + y/2 * cmsg_step2 + min(w2-1, x/2 + 1); + __global const float *r_cur = r_cur_ + y/2 * cmsg_step2 + max(0, x/2 - 1); + + __global float *data_cost_new = ctemp + y * cmsg_step1 + x; + + __global const float *disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step2 + x/2; + __global const float *data_cost = data_cost_ + y * cmsg_step1 + x; + + if (y < h && x < w) + { + for(int d = 0; d < nr_plane2; d++) + { + int idx2 = d * cdisp_step2; + + float val = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2]; + data_cost_new[d * cdisp_step1] = val; + } + } + + __global float *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x; + __global float *disparity_selected_new = selected_disp_pyr_new + y * cmsg_step1 + x; + + __global float *u_new = u_new_ + y * cmsg_step1 + x; + __global float *d_new = d_new_ + y * cmsg_step1 + x; + __global float *l_new = l_new_ + y * cmsg_step1 + x; + __global float *r_new = r_new_ + y * cmsg_step1 + x; + + barrier(CLK_LOCAL_MEM_FENCE); + + if(y < h && x < w) + { + u_cur = u_cur_ + y/2 * cmsg_step2 + x/2; + d_cur = d_cur_ + y/2 * cmsg_step2 + x/2; + l_cur = l_cur_ + y/2 * cmsg_step2 + x/2; + r_cur = r_cur_ + y/2 * cmsg_step2 + x/2; + + for(int i = 0; i < nr_plane; i++) + { + float minimum = FLT_MAX; + int id = 0; + + for(int j = 0; j < nr_plane2; j++) + { + float cur = data_cost_new[j * cdisp_step1]; + if(cur < minimum) + { + minimum = cur; + id = j; + } + } + data_cost_selected[i * cdisp_step1] = data_cost[id * cdisp_step1]; + disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2]; + u_new[i * cdisp_step1] = u_cur[id * cdisp_step2]; + d_new[i * cdisp_step1] = d_cur[id * cdisp_step2]; + l_new[i * cdisp_step1] = l_cur[id * cdisp_step2]; + r_new[i * cdisp_step1] = r_cur[id * cdisp_step2]; + data_cost_new[id * cdisp_step1] = FLT_MAX; + } + } +} + +/////////////////////////////////////////////////////////////// +//////////////////// calc all iterations ///////////////////// +/////////////////////////////////////////////////////////////// +void message_per_pixel_0(__global const short *data, __global 
short *msg_dst, __global const short *msg1, + __global const short *msg2, __global const short *msg3, + __global const short *dst_disp, __global const short *src_disp, + int nr_plane, __global short *temp, + float cmax_disc_term, int cdisp_step1, float cdisc_single_jump) +{ + short minimum = SHRT_MAX; + for(int d = 0; d < nr_plane; d++) + { + int idx = d * cdisp_step1; + short val = data[idx] + msg1[idx] + msg2[idx] + msg3[idx]; + + if(val < minimum) + minimum = val; + + msg_dst[idx] = val; + } + + float sum = 0; + for(int d = 0; d < nr_plane; d++) + { + float cost_min = minimum + cmax_disc_term; + short src_disp_reg = src_disp[d * cdisp_step1]; + + for(int d2 = 0; d2 < nr_plane; d2++) + cost_min = fmin(cost_min, (msg_dst[d2 * cdisp_step1] + + cdisc_single_jump * abs(dst_disp[d2 * cdisp_step1] - src_disp_reg))); + + temp[d * cdisp_step1] = convert_short_sat_rte(cost_min); + sum += cost_min; + } + sum /= nr_plane; + + for(int d = 0; d < nr_plane; d++) + msg_dst[d * cdisp_step1] = convert_short_sat_rte(temp[d * cdisp_step1] - sum); +} +void message_per_pixel_1(__global const float *data, __global float *msg_dst, __global const float *msg1, + __global const float *msg2, __global const float *msg3, + __global const float *dst_disp, __global const float *src_disp, + int nr_plane, __global float *temp, + float cmax_disc_term, int cdisp_step1, float cdisc_single_jump) +{ + float minimum = FLT_MAX; + for(int d = 0; d < nr_plane; d++) + { + int idx = d * cdisp_step1; + float val = data[idx] + msg1[idx] + msg2[idx] + msg3[idx]; + + if(val < minimum) + minimum = val; + + msg_dst[idx] = val; + } + + float sum = 0; + for(int d = 0; d < nr_plane; d++) + { + float cost_min = minimum + cmax_disc_term; + float src_disp_reg = src_disp[d * cdisp_step1]; + + for(int d2 = 0; d2 < nr_plane; d2++) + cost_min = fmin(cost_min, (msg_dst[d2 * cdisp_step1] + + cdisc_single_jump * fabs(dst_disp[d2 * cdisp_step1] - src_disp_reg))); + + temp[d * cdisp_step1] = cost_min; + sum += cost_min; + } + sum /= nr_plane; + + for(int d = 0; d < nr_plane; d++) + msg_dst[d * cdisp_step1] = temp[d * cdisp_step1] - sum; +} +__kernel void compute_message_0(__global short *u_, __global short *d_, __global short *l_, __global short *r_, + __global const short *data_cost_selected, __global const short *selected_disp_pyr_cur, + __global short *ctemp, int h, int w, int nr_plane, int i, + float cmax_disc_term, int cdisp_step1, int cmsg_step1, float cdisc_single_jump) +{ + int y = get_global_id(1); + int x = ((get_global_id(0)) << 1) + ((y + i) & 1); + + if (y > 0 && y < h - 1 && x > 0 && x < w - 1) + { + __global const short *data = data_cost_selected + y * cmsg_step1 + x; + + __global short *u = u_ + y * cmsg_step1 + x; + __global short *d = d_ + y * cmsg_step1 + x; + __global short *l = l_ + y * cmsg_step1 + x; + __global short *r = r_ + y * cmsg_step1 + x; + + __global const short *disp = selected_disp_pyr_cur + y * cmsg_step1 + x; + + __global short *temp = ctemp + y * cmsg_step1 + x; + + message_per_pixel_0(data, u, r - 1, u + cmsg_step1, l + 1, disp, disp - cmsg_step1, nr_plane, temp, + cmax_disc_term, cdisp_step1, cdisc_single_jump); + message_per_pixel_0(data, d, d - cmsg_step1, r - 1, l + 1, disp, disp + cmsg_step1, nr_plane, temp, + cmax_disc_term, cdisp_step1, cdisc_single_jump); + message_per_pixel_0(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp, + cmax_disc_term, cdisp_step1, cdisc_single_jump); + message_per_pixel_0(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp, 
+ cmax_disc_term, cdisp_step1, cdisc_single_jump); + } +} +__kernel void compute_message_1(__global float *u_, __global float *d_, __global float *l_, __global float *r_, + __global const float *data_cost_selected, __global const float *selected_disp_pyr_cur, + __global float *ctemp, int h, int w, int nr_plane, int i, + float cmax_disc_term, int cdisp_step1, int cmsg_step1, float cdisc_single_jump) +{ + int y = get_global_id(1); + int x = ((get_global_id(0)) << 1) + ((y + i) & 1); + + if (y > 0 && y < h - 1 && x > 0 && x < w - 1) + { + __global const float *data = data_cost_selected + y * cmsg_step1 + x; + + __global float *u = u_ + y * cmsg_step1 + x; + __global float *d = d_ + y * cmsg_step1 + x; + __global float *l = l_ + y * cmsg_step1 + x; + __global float *r = r_ + y * cmsg_step1 + x; + + __global const float *disp = selected_disp_pyr_cur + y * cmsg_step1 + x; + __global float *temp = ctemp + y * cmsg_step1 + x; + + message_per_pixel_1(data, u, r - 1, u + cmsg_step1, l + 1, disp, disp - cmsg_step1, nr_plane, temp, + cmax_disc_term, cdisp_step1, cdisc_single_jump); + message_per_pixel_1(data, d, d - cmsg_step1, r - 1, l + 1, disp, disp + cmsg_step1, nr_plane, temp, + cmax_disc_term, cdisp_step1, cdisc_single_jump); + message_per_pixel_1(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp, + cmax_disc_term, cdisp_step1, cdisc_single_jump); + message_per_pixel_1(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp, + cmax_disc_term, cdisp_step1, cdisc_single_jump); + } +} + +/////////////////////////////////////////////////////////////// +/////////////////////////// output //////////////////////////// +/////////////////////////////////////////////////////////////// +__kernel void compute_disp_0(__global const short *u_, __global const short *d_, __global const short *l_, + __global const short *r_, __global const short * data_cost_selected, + __global const short *disp_selected_pyr, + __global short* disp, + int res_step, int cols, int rows, int nr_plane, + int cmsg_step1, int cdisp_step1) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1) + { + __global const short *data = data_cost_selected + y * cmsg_step1 + x; + __global const short *disp_selected = disp_selected_pyr + y * cmsg_step1 + x; + + __global const short *u = u_ + (y+1) * cmsg_step1 + (x+0); + __global const short *d = d_ + (y-1) * cmsg_step1 + (x+0); + __global const short *l = l_ + (y+0) * cmsg_step1 + (x+1); + __global const short *r = r_ + (y+0) * cmsg_step1 + (x-1); + + short best = 0; + short best_val = SHRT_MAX; + + for (int i = 0; i < nr_plane; ++i) + { + int idx = i * cdisp_step1; + short val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx]; + + if (val < best_val) + { + best_val = val; + best = disp_selected[idx]; + } + } + disp[res_step * y + x] = best; + } +} +__kernel void compute_disp_1(__global const float *u_, __global const float *d_, __global const float *l_, + __global const float *r_, __global const float *data_cost_selected, + __global const float *disp_selected_pyr, + __global short *disp, + int res_step, int cols, int rows, int nr_plane, + int cmsg_step1, int cdisp_step1) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1) + { + __global const float *data = data_cost_selected + y * cmsg_step1 + x; + __global const float *disp_selected = disp_selected_pyr + y * cmsg_step1 + x; + + __global const float *u = u_ + (y+1) * 
cmsg_step1 + (x+0); + __global const float *d = d_ + (y-1) * cmsg_step1 + (x+0); + __global const float *l = l_ + (y+0) * cmsg_step1 + (x+1); + __global const float *r = r_ + (y+0) * cmsg_step1 + (x-1); + + short best = 0; + short best_val = SHRT_MAX; + for (int i = 0; i < nr_plane; ++i) + { + int idx = i * cdisp_step1; + float val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx]; + + if (val < best_val) + { + best_val = val; + best = convert_short_sat_rte(disp_selected[idx]); + } + } + disp[res_step * y + x] = best; + } +} diff --git a/modules/ocl/src/opencl/tvl1flow.cl b/modules/ocl/src/opencl/tvl1flow.cl new file mode 100644 index 000000000..e0ff7307b --- /dev/null +++ b/modules/ocl/src/opencl/tvl1flow.cl @@ -0,0 +1,407 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Jin Ma jin@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +__kernel void centeredGradientKernel(__global const float* src, int src_col, int src_row, int src_step, +__global float* dx, __global float* dy, int dx_step) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if((x < src_col)&&(y < src_row)) + { + int src_x1 = (x + 1) < (src_col -1)? (x + 1) : (src_col - 1); + int src_x2 = (x - 1) > 0 ? 
(x -1) : 0; + + //if(src[y * src_step + src_x1] == src[y * src_step+ src_x2]) + //{ + // printf("y = %d\n", y); + // printf("src_x1 = %d\n", src_x1); + // printf("src_x2 = %d\n", src_x2); + //} + dx[y * dx_step+ x] = 0.5f * (src[y * src_step + src_x1] - src[y * src_step+ src_x2]); + + int src_y1 = (y+1) < (src_row - 1) ? (y + 1) : (src_row - 1); + int src_y2 = (y - 1) > 0 ? (y - 1) : 0; + dy[y * dx_step+ x] = 0.5f * (src[src_y1 * src_step + x] - src[src_y2 * src_step+ x]); + } + +} + +float bicubicCoeff(float x_) +{ + + float x = fabs(x_); + if (x <= 1.0f) + { + return x * x * (1.5f * x - 2.5f) + 1.0f; + } + else if (x < 2.0f) + { + return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f; + } + else + { + return 0.0f; + } + +} + +__kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_col, int I0_row, + image2d_t tex_I1, image2d_t tex_I1x, image2d_t tex_I1y, + __global const float* u1, int u1_step, + __global const float* u2, + __global float* I1w, + __global float* I1wx, /*int I1wx_step,*/ + __global float* I1wy, /*int I1wy_step,*/ + __global float* grad, /*int grad_step,*/ + __global float* rho, + int I1w_step, + int u2_step, + int u1_offset_x, + int u1_offset_y, + int u2_offset_x, + int u2_offset_y) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if(x < I0_col&&y < I0_row) + { + //const float u1Val = u1(y, x); + const float u1Val = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x]; + //const float u2Val = u2(y, x); + const float u2Val = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x]; + + const float wx = x + u1Val; + const float wy = y + u2Val; + + const int xmin = ceil(wx - 2.0f); + const int xmax = floor(wx + 2.0f); + + const int ymin = ceil(wy - 2.0f); + const int ymax = floor(wy + 2.0f); + + float sum = 0.0f; + float sumx = 0.0f; + float sumy = 0.0f; + float wsum = 0.0f; + sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; + + for (int cy = ymin; cy <= ymax; ++cy) + { + for (int cx = xmin; cx <= xmax; ++cx) + { + const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy); + + //sum += w * tex2D(tex_I1 , cx, cy); + int2 cood = (int2)(cx, cy); + sum += w * read_imagef(tex_I1, sampleri, cood).x; + //sumx += w * tex2D(tex_I1x, cx, cy); + sumx += w * read_imagef(tex_I1x, sampleri, cood).x; + //sumy += w * tex2D(tex_I1y, cx, cy); + sumy += w * read_imagef(tex_I1y, sampleri, cood).x; + + wsum += w; + } + } + + const float coeff = 1.0f / wsum; + + const float I1wVal = sum * coeff; + const float I1wxVal = sumx * coeff; + const float I1wyVal = sumy * coeff; + + I1w[y * I1w_step + x] = I1wVal; + I1wx[y * I1w_step + x] = I1wxVal; + I1wy[y * I1w_step + x] = I1wyVal; + + const float Ix2 = I1wxVal * I1wxVal; + const float Iy2 = I1wyVal * I1wyVal; + + // store the |Grad(I1)|^2 + grad[y * I1w_step + x] = Ix2 + Iy2; + + // compute the constant part of the rho function + const float I0Val = I0[y * I0_step + x]; + rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val; + } + +} + +float readImage(__global const float *image, const int x, const int y, const int rows, const int cols, const int elemCntPerRow) +{ + int i0 = clamp(x, 0, cols - 1); + int j0 = clamp(y, 0, rows - 1); + int i1 = clamp(x + 1, 0, cols - 1); + int j1 = clamp(y + 1, 0, rows - 1); + + return image[j0 * elemCntPerRow + i0]; +} + +__kernel void warpBackwardKernelNoImage2d(__global const float* I0, int I0_step, int I0_col, int I0_row, + __global const float* tex_I1, __global const float* tex_I1x, __global const float* 
tex_I1y, + __global const float* u1, int u1_step, + __global const float* u2, + __global float* I1w, + __global float* I1wx, /*int I1wx_step,*/ + __global float* I1wy, /*int I1wy_step,*/ + __global float* grad, /*int grad_step,*/ + __global float* rho, + int I1w_step, + int u2_step, + int I1_step, + int I1x_step) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if(x < I0_col&&y < I0_row) + { + //const float u1Val = u1(y, x); + const float u1Val = u1[y * u1_step + x]; + //const float u2Val = u2(y, x); + const float u2Val = u2[y * u2_step + x]; + + const float wx = x + u1Val; + const float wy = y + u2Val; + + const int xmin = ceil(wx - 2.0f); + const int xmax = floor(wx + 2.0f); + + const int ymin = ceil(wy - 2.0f); + const int ymax = floor(wy + 2.0f); + + float sum = 0.0f; + float sumx = 0.0f; + float sumy = 0.0f; + float wsum = 0.0f; + + for (int cy = ymin; cy <= ymax; ++cy) + { + for (int cx = xmin; cx <= xmax; ++cx) + { + const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy); + + int2 cood = (int2)(cx, cy); + sum += w * readImage(tex_I1, cood.x, cood.y, I0_col, I0_row, I1_step); + sumx += w * readImage(tex_I1x, cood.x, cood.y, I0_col, I0_row, I1x_step); + sumy += w * readImage(tex_I1y, cood.x, cood.y, I0_col, I0_row, I1x_step); + wsum += w; + } + } + + const float coeff = 1.0f / wsum; + + const float I1wVal = sum * coeff; + const float I1wxVal = sumx * coeff; + const float I1wyVal = sumy * coeff; + + I1w[y * I1w_step + x] = I1wVal; + I1wx[y * I1w_step + x] = I1wxVal; + I1wy[y * I1w_step + x] = I1wyVal; + + const float Ix2 = I1wxVal * I1wxVal; + const float Iy2 = I1wyVal * I1wyVal; + + // store the |Grad(I1)|^2 + grad[y * I1w_step + x] = Ix2 + Iy2; + + // compute the constant part of the rho function + const float I0Val = I0[y * I0_step + x]; + rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val; + } + +} + + +__kernel void estimateDualVariablesKernel(__global const float* u1, int u1_col, int u1_row, int u1_step, + __global const float* u2, + __global float* p11, int p11_step, + __global float* p12, + __global float* p21, + __global float* p22, + const float taut, + int u2_step, + int u1_offset_x, + int u1_offset_y, + int u2_offset_x, + int u2_offset_y) +{ + + //const int x = blockIdx.x * blockDim.x + threadIdx.x; + //const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = get_global_id(0); + const int y = get_global_id(1); + + if(x < u1_col && y < u1_row) + { + int src_x1 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1); + const float u1x = u1[(y + u1_offset_y) * u1_step + src_x1 + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x]; + + int src_y1 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1); + const float u1y = u1[(src_y1 + u1_offset_y) * u1_step + x + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x]; + + int src_x2 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1); + const float u2x = u2[(y + u2_offset_y) * u2_step + src_x2 + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x]; + + int src_y2 = (y + 1) < (u1_row - 1) ? 
(y + 1) : (u1_row - 1); + const float u2y = u2[(src_y2 + u2_offset_y) * u2_step + x + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x]; + + const float g1 = hypot(u1x, u1y); + const float g2 = hypot(u2x, u2y); + + const float ng1 = 1.0f + taut * g1; + const float ng2 = 1.0f + taut * g2; + + p11[y * p11_step + x] = (p11[y * p11_step + x] + taut * u1x) / ng1; + p12[y * p11_step + x] = (p12[y * p11_step + x] + taut * u1y) / ng1; + p21[y * p11_step + x] = (p21[y * p11_step + x] + taut * u2x) / ng2; + p22[y * p11_step + x] = (p22[y * p11_step + x] + taut * u2y) / ng2; + } + +} + +float divergence(__global const float* v1, __global const float* v2, int y, int x, int v1_step, int v2_step) +{ + + if (x > 0 && y > 0) + { + const float v1x = v1[y * v1_step + x] - v1[y * v1_step + x - 1]; + const float v2y = v2[y * v2_step + x] - v2[(y - 1) * v2_step + x]; + return v1x + v2y; + } + else + { + if (y > 0) + return v1[y * v1_step + 0] + v2[y * v2_step + 0] - v2[(y - 1) * v2_step + 0]; + else + { + if (x > 0) + return v1[0 * v1_step + x] - v1[0 * v1_step + x - 1] + v2[0 * v2_step + x]; + else + return v1[0 * v1_step + 0] + v2[0 * v2_step + 0]; + } + } + +} + +__kernel void estimateUKernel(__global const float* I1wx, int I1wx_col, int I1wx_row, int I1wx_step, + __global const float* I1wy, /*int I1wy_step,*/ + __global const float* grad, /*int grad_step,*/ + __global const float* rho_c, /*int rho_c_step,*/ + __global const float* p11, /*int p11_step,*/ + __global const float* p12, /*int p12_step,*/ + __global const float* p21, /*int p21_step,*/ + __global const float* p22, /*int p22_step,*/ + __global float* u1, int u1_step, + __global float* u2, + __global float* error, const float l_t, const float theta, int u2_step, + int u1_offset_x, + int u1_offset_y, + int u2_offset_x, + int u2_offset_y) +{ + + //const int x = blockIdx.x * blockDim.x + threadIdx.x; + //const int y = blockIdx.y * blockDim.y + threadIdx.y; + + int x = get_global_id(0); + int y = get_global_id(1); + + + if(x < I1wx_col && y < I1wx_row) + { + const float I1wxVal = I1wx[y * I1wx_step + x]; + const float I1wyVal = I1wy[y * I1wx_step + x]; + const float gradVal = grad[y * I1wx_step + x]; + const float u1OldVal = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x]; + const float u2OldVal = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x]; + + const float rho = rho_c[y * I1wx_step + x] + (I1wxVal * u1OldVal + I1wyVal * u2OldVal); + + // estimate the values of the variable (v1, v2) (thresholding operator TH) + + float d1 = 0.0f; + float d2 = 0.0f; + + if (rho < -l_t * gradVal) + { + d1 = l_t * I1wxVal; + d2 = l_t * I1wyVal; + } + else if (rho > l_t * gradVal) + { + d1 = -l_t * I1wxVal; + d2 = -l_t * I1wyVal; + } + else if (gradVal > 1.192092896e-07f) + { + const float fi = -rho / gradVal; + d1 = fi * I1wxVal; + d2 = fi * I1wyVal; + } + + const float v1 = u1OldVal + d1; + const float v2 = u2OldVal + d2; + + // compute the divergence of the dual variable (p1, p2) + + const float div_p1 = divergence(p11, p12, y, x, I1wx_step, I1wx_step); + const float div_p2 = divergence(p21, p22, y, x, I1wx_step, I1wx_step); + + // estimate the values of the optical flow (u1, u2) + + const float u1NewVal = v1 + theta * div_p1; + const float u2NewVal = v2 + theta * div_p2; + + u1[(y + u1_offset_y) * u1_step + x + u1_offset_x] = u1NewVal; + u2[(y + u2_offset_y) * u2_step + x + u2_offset_x] = u2NewVal; + + const float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal); + const float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal); + error[y * 
I1wx_step + x] = n1 + n2; + } + +} diff --git a/modules/ocl/src/stereo_csbp.cpp b/modules/ocl/src/stereo_csbp.cpp new file mode 100644 index 000000000..f124488b9 --- /dev/null +++ b/modules/ocl/src/stereo_csbp.cpp @@ -0,0 +1,756 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Jia Haipeng, jiahaipeng95@gmail.com +// Jin Ma, jin@multicorewareinc.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" + +using namespace cv; +using namespace cv::ocl; +using namespace std; + +#if !defined (HAVE_OPENCL) + +namespace cv +{ + namespace ocl + { + + void cv::ocl::StereoConstantSpaceBP::estimateRecommendedParams(int, int, int &, int &, int &, int &) + { + throw_nogpu(); + } + cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, int) + { + throw_nogpu(); + } + cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, float, float, + float, float, int, int) + { + throw_nogpu(); + } + + void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &, const oclMat &, oclMat &) + { + throw_nogpu(); + } + } +} + +#else /* !defined (HAVE_OPENCL) */ + +namespace cv +{ + namespace ocl + { + + ///////////////////////////OpenCL kernel Strings/////////////////////////// + extern const char *stereocsbp; + } + +} +namespace cv +{ + namespace ocl + { + namespace stereoCSBP + { + ////////////////////////////////////////////////////////////////////////// + //////////////////////////////common//////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + static inline int divUp(int total, int grain) + { + return (total + grain - 1) / grain; + } + static String get_kernel_name(String kernel_name, int data_type) + { + return kernel_name + (data_type == CV_16S ? "0" : "1"); + } + using cv::ocl::StereoConstantSpaceBP; + ////////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////init_data_cost////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////////// + static void init_data_cost_caller(const oclMat &left, const oclMat &right, oclMat &temp, + StereoConstantSpaceBP &rthis, + int msg_step, int h, int w, int level) + { + Context *clCxt = left.clCxt; + int data_type = rthis.msg_type; + int channels = left.oclchannels(); + + String kernelName = get_kernel_name("init_data_cost_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + //size_t blockSize = 256; + size_t localThreads[] = {32, 8 ,1}; + size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], + divUp(h, localThreads[1]) *localThreads[1], + 1 + }; + + int cdisp_step1 = msg_step * h; + openCLVerifyKernel(clCxt, kernel, localThreads); + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&temp.data)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&left.data)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&right.data)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&level)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&channels)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&msg_step)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&rthis.data_weight)); + openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_float), (void *)&rthis.max_data_term)); + openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&cdisp_step1)); + openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&rthis.min_disp_th)); + openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&left.step)); + openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&rthis.ndisp)); + 
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + + static void init_data_cost_reduce_caller(const oclMat &left, const oclMat &right, oclMat &temp, + StereoConstantSpaceBP &rthis, + int msg_step, int h, int w, int level) + { + + Context *clCxt = left.clCxt; + int data_type = rthis.msg_type; + int channels = left.oclchannels(); + int win_size = (int)std::pow(2.f, level); + + String kernelName = get_kernel_name("init_data_cost_reduce_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + const int threadsNum = 256; + //size_t blockSize = threadsNum; + size_t localThreads[3] = {win_size, 1, threadsNum / win_size}; + size_t globalThreads[3] = {w *localThreads[0], + h * divUp(rthis.ndisp, localThreads[2]) *localThreads[1], 1 * localThreads[2] + }; + + int local_mem_size = threadsNum * sizeof(float); + int cdisp_step1 = msg_step * h; + + openCLVerifyKernel(clCxt, kernel, localThreads); + + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&temp.data)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&left.data)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&right.data)); + openCLSafeCall(clSetKernelArg(kernel, 3, local_mem_size, (void *)NULL)); + openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&level)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&left.rows)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&win_size)); + openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&channels)); + openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&rthis.ndisp)); + openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&left.step)); + openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_float), (void *)&rthis.data_weight)); + openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_float), (void *)&rthis.max_data_term)); + openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&rthis.min_disp_th)); + openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&cdisp_step1)); + openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&msg_step)); + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + + static void get_first_initial_local_caller(uchar *data_cost_selected, uchar *disp_selected_pyr, + oclMat &temp, StereoConstantSpaceBP &rthis, + int h, int w, int nr_plane, int msg_step) + { + Context *clCxt = temp.clCxt; + int data_type = rthis.msg_type; + + String kernelName = get_kernel_name("get_first_k_initial_local_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + //size_t blockSize = 256; + size_t localThreads[] = {32, 8 ,1}; + size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], + divUp(h, localThreads[1]) *localThreads[1], + 1 + }; + + int disp_step = msg_step * h; + openCLVerifyKernel(clCxt, kernel, localThreads); + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void 
*)&data_cost_selected)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&disp_selected_pyr)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&temp.data)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nr_plane)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp)); + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + static void get_first_initial_global_caller(uchar *data_cost_selected, uchar *disp_selected_pyr, + oclMat &temp, StereoConstantSpaceBP &rthis, + int h, int w, int nr_plane, int msg_step) + { + Context *clCxt = temp.clCxt; + int data_type = rthis.msg_type; + + String kernelName = get_kernel_name("get_first_k_initial_global_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + //size_t blockSize = 256; + size_t localThreads[] = {32, 8, 1}; + size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], + divUp(h, localThreads[1]) *localThreads[1], + 1 + }; + + int disp_step = msg_step * h; + openCLVerifyKernel(clCxt, kernel, localThreads); + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_cost_selected)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&disp_selected_pyr)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&temp.data)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nr_plane)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp)); + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + + static void init_data_cost(const oclMat &left, const oclMat &right, oclMat &temp, StereoConstantSpaceBP &rthis, + uchar *disp_selected_pyr, uchar *data_cost_selected, + size_t msg_step, int h, int w, int level, int nr_plane) + { + + if(level <= 1) + init_data_cost_caller(left, right, temp, rthis, msg_step, h, w, level); + else + init_data_cost_reduce_caller(left, right, temp, rthis, msg_step, h, w, level); + + if(rthis.use_local_init_data_cost == true) + { + get_first_initial_local_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, nr_plane, msg_step); + } + else + { + get_first_initial_global_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, + nr_plane, msg_step); + } + } + + /////////////////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////////////compute_data_cost////////////////////////////////////////////// + 
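// A minimal sketch (not taken verbatim from this patch) of the launch-size arithmetic shared by the
// callers in this file, assuming divUp is the usual ceiling-division helper used throughout the ocl
// module; helper names below are illustrative only. Each global NDRange dimension is rounded up to
// the next multiple of the work-group size, and the kernel is expected to bounds-check the padding.
static inline size_t divUpSketch(size_t total, size_t grain)
{
    return (total + grain - 1) / grain;                      // ceil(total / grain)
}
static void roundGlobalSizeSketch(int w, int h, const size_t local[3], size_t global[3])
{
    global[0] = divUpSketch(w, local[0]) * local[0];         // smallest multiple of local[0] covering w
    global[1] = divUpSketch(h, local[1]) * local[1];         // smallest multiple of local[1] covering h
    global[2] = 1;                                           // the reduce-style callers use a third dimension instead
}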
//////////////////////////////////////////////////////////////////////////////////////////////// + static void compute_data_cost_caller(uchar *disp_selected_pyr, uchar *data_cost, + StereoConstantSpaceBP &rthis, int msg_step1, + int msg_step2, const oclMat &left, const oclMat &right, int h, + int w, int h2, int level, int nr_plane) + { + Context *clCxt = left.clCxt; + int channels = left.oclchannels(); + int data_type = rthis.msg_type; + + String kernelName = get_kernel_name("compute_data_cost_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + //size_t blockSize = 256; + size_t localThreads[] = {32, 8, 1}; + size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], + divUp(h, localThreads[1]) *localThreads[1], + 1 + }; + + int disp_step1 = msg_step1 * h; + int disp_step2 = msg_step2 * h2; + openCLVerifyKernel(clCxt, kernel, localThreads); + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disp_selected_pyr)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&data_cost)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&left.data)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&right.data)); + openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&w)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&level)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&nr_plane)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&channels)); + openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&msg_step1)); + openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&msg_step2)); + openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&disp_step1)); + openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step2)); + openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_float), (void *)&rthis.data_weight)); + openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.max_data_term)); + openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&left.step)); + openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&rthis.min_disp_th)); + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + static void compute_data_cost_reduce_caller(uchar *disp_selected_pyr, uchar *data_cost, + StereoConstantSpaceBP &rthis, int msg_step1, + int msg_step2, const oclMat &left, const oclMat &right, int h, + int w, int h2, int level, int nr_plane) + { + Context *clCxt = left.clCxt; + int data_type = rthis.msg_type; + int channels = left.oclchannels(); + int win_size = (int)std::pow(2.f, level); + + String kernelName = get_kernel_name("compute_data_cost_reduce_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + const size_t threadsNum = 256; + //size_t blockSize = threadsNum; + size_t localThreads[3] = {win_size, 1, threadsNum / win_size}; + size_t globalThreads[3] = {w *localThreads[0], + h * divUp(nr_plane, localThreads[2]) *localThreads[1], 1 * localThreads[2] + }; + + int disp_step1 = msg_step1 * h; + int disp_step2 = msg_step2 * h2; + size_t local_mem_size = threadsNum * sizeof(float); + openCLVerifyKernel(clCxt, kernel, 
localThreads); + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disp_selected_pyr)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&data_cost)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&left.data)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&right.data)); + openCLSafeCall(clSetKernelArg(kernel, 4, local_mem_size, (void *)NULL)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&level)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.rows)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&left.cols)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&nr_plane)); + openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&channels)); + openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&win_size)); + openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&msg_step1)); + openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&msg_step2)); + openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&disp_step1)); + openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&disp_step2)); + openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_float), (void *)&rthis.data_weight)); + openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_float), (void *)&rthis.max_data_term)); + openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int), (void *)&left.step)); + openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int), (void *)&rthis.min_disp_th)); + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + static void compute_data_cost(uchar *disp_selected_pyr, uchar *data_cost, StereoConstantSpaceBP &rthis, + int msg_step1, int msg_step2, const oclMat &left, const oclMat &right, int h, int w, + int h2, int level, int nr_plane) + { + if(level <= 1) + compute_data_cost_caller(disp_selected_pyr, data_cost, rthis, msg_step1, msg_step2, + left, right, h, w, h2, level, nr_plane); + else + compute_data_cost_reduce_caller(disp_selected_pyr, data_cost, rthis, msg_step1, msg_step2, + left, right, h, w, h2, level, nr_plane); + } + //////////////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////init message////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////////////////// + static void init_message(uchar *u_new, uchar *d_new, uchar *l_new, uchar *r_new, + uchar *u_cur, uchar *d_cur, uchar *l_cur, uchar *r_cur, + uchar *disp_selected_pyr_new, uchar *disp_selected_pyr_cur, + uchar *data_cost_selected, uchar *data_cost, oclMat &temp, StereoConstantSpaceBP rthis, + size_t msg_step1, size_t msg_step2, int h, int w, int nr_plane, + int h2, int w2, int nr_plane2) + { + Context *clCxt = temp.clCxt; + int data_type = rthis.msg_type; + + String kernelName = get_kernel_name("init_message_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + //size_t blockSize = 256; + size_t localThreads[] = {32, 8, 1}; + size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], + divUp(h, localThreads[1]) *localThreads[1], + 1 + }; + + int 
disp_step1 = msg_step1 * h; + int disp_step2 = msg_step2 * h2; + openCLVerifyKernel(clCxt, kernel, localThreads); + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u_new)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_new)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l_new)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r_new)); + openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&u_cur)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&d_cur)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&l_cur)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&r_cur)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *)&temp.data)); + openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *)&disp_selected_pyr_new)); + openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *)&disp_selected_pyr_cur)); + openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *)&data_cost_selected)); + openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *)&data_cost)); + openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&w)); + openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&nr_plane)); + openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&h2)); + openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_int), (void *)&w2)); + openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int), (void *)&nr_plane2)); + openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int), (void *)&disp_step1)); + openCLSafeCall(clSetKernelArg(kernel, 20, sizeof(cl_int), (void *)&disp_step2)); + openCLSafeCall(clSetKernelArg(kernel, 21, sizeof(cl_int), (void *)&msg_step1)); + openCLSafeCall(clSetKernelArg(kernel, 22, sizeof(cl_int), (void *)&msg_step2)); + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + //////////////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////calc_all_iterations//////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////////////////////// + static void calc_all_iterations_caller(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected, + uchar *disp_selected_pyr, oclMat &temp, StereoConstantSpaceBP rthis, + int msg_step, int h, int w, int nr_plane, int i) + { + Context *clCxt = temp.clCxt; + int data_type = rthis.msg_type; + + String kernelName = get_kernel_name("compute_message_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + size_t localThreads[] = {32, 8, 1}; + size_t globalThreads[] = {divUp(w, (localThreads[0]) << 1) *localThreads[0], + divUp(h, localThreads[1]) *localThreads[1], + 1 + }; + + int disp_step = msg_step * h; + openCLVerifyKernel(clCxt, kernel, localThreads); + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r)); + openCLSafeCall(clSetKernelArg(kernel, 4, 
sizeof(cl_mem), (void *)&data_cost_selected)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&disp_selected_pyr)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&temp.data)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&h)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&w)); + openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&nr_plane)); + openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&i)); + openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_float), (void *)&rthis.max_disc_term)); + openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step)); + openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&msg_step)); + openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.disc_single_jump)); + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + static void calc_all_iterations(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected, + uchar *disp_selected_pyr, oclMat &temp, StereoConstantSpaceBP rthis, + int msg_step, int h, int w, int nr_plane) + { + for(int t = 0; t < rthis.iters; t++) + calc_all_iterations_caller(u, d, l, r, data_cost_selected, disp_selected_pyr, temp, rthis, + msg_step, h, w, nr_plane, t & 1); + } + + /////////////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////compute_disp//////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////////////////////// + static void compute_disp(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected, + uchar *disp_selected_pyr, StereoConstantSpaceBP &rthis, size_t msg_step, + oclMat &disp, int nr_plane) + { + Context *clCxt = disp.clCxt; + int data_type = rthis.msg_type; + + String kernelName = get_kernel_name("compute_disp_", data_type); + + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); + + //size_t blockSize = 256; + size_t localThreads[] = {32, 8, 1}; + size_t globalThreads[] = {divUp(disp.cols, localThreads[0]) *localThreads[0], + divUp(disp.rows, localThreads[1]) *localThreads[1], + 1 + }; + + int step_size = disp.step / disp.elemSize(); + int disp_step = disp.rows * msg_step; + openCLVerifyKernel(clCxt, kernel, localThreads); + openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u)); + openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d)); + openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l)); + openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r)); + openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&data_cost_selected)); + openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&disp_selected_pyr)); + openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&disp.data)); + openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&step_size)); + openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&disp.cols)); + openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&disp.rows)); + openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&nr_plane)); + openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&msg_step)); + 
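// Sketch of the schedule driven by calc_all_iterations above (hypothetical host-side pseudocode;
// the real update runs inside the compute_message kernel). The parity flag (t & 1) passed as the
// "i" argument selects one half of a checkerboard per pass, which is also why that launch rounds
// divUp(w, localThreads[0] << 1): only every other column of each row is active, so a message read
// in one pass is never overwritten in the same pass.
static void redBlackSweepSketch(int iters, int w, int h, void (*updatePixel)(int x, int y))
{
    for (int t = 0; t < iters; ++t)
    {
        const int parity = t & 1;                            // value forwarded to the kernel
        for (int y = 0; y < h; ++y)
            for (int x = (y + parity) & 1; x < w; x += 2)    // checkerboard: half of the grid per pass
                updatePixel(x, y);
    }
}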
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step)); + openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, + globalThreads, localThreads, 0, NULL, NULL)); + + clFinish(*(cl_command_queue*)getoclCommandQueue()); + openCLSafeCall(clReleaseKernel(kernel)); + } + } + } +} +namespace +{ + const float DEFAULT_MAX_DATA_TERM = 30.0f; + const float DEFAULT_DATA_WEIGHT = 1.0f; + const float DEFAULT_MAX_DISC_TERM = 160.0f; + const float DEFAULT_DISC_SINGLE_JUMP = 10.0f; +} + +void cv::ocl::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane) +{ + ndisp = (int) ((float) width / 3.14f); + if ((ndisp & 1) != 0) + ndisp++; + + int mm = ::max(width, height); + iters = mm / 100 + ((mm > 1200) ? - 4 : 4); + + levels = (int)::log(static_cast<double>(mm)) * 2 / 3; + if (levels == 0) levels++; + + nr_plane = (int) ((float) ndisp / std::pow(2.0, levels + 1)); +} + +cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_, + int msg_type_) + + : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_), + max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT), + max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP), min_disp_th(0), + msg_type(msg_type_), use_local_init_data_cost(true) +{ + CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S); +} + + +cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_, + float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_, + int min_disp_th_, int msg_type_) + : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_), + max_data_term(max_data_term_), data_weight(data_weight_), + max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_), min_disp_th(min_disp_th_), + msg_type(msg_type_), use_local_init_data_cost(true) +{ + CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S); +} + +template<class T> +static void csbp_operator(StereoConstantSpaceBP &rthis, oclMat u[2], oclMat d[2], oclMat l[2], oclMat r[2], + oclMat disp_selected_pyr[2], oclMat &data_cost, oclMat &data_cost_selected, + oclMat &temp, oclMat &out, const oclMat &left, const oclMat &right, oclMat &disp) +{ + CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane + && left.rows == right.rows && left.cols == right.cols && left.type() == right.type()); + + CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3)); + + const Scalar zero = Scalar::all(0); + + ////////////////////////////////////Init/////////////////////////////////////////////////// + int rows = left.rows; + int cols = left.cols; + + rthis.levels = min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0))); + int levels = rthis.levels; + + AutoBuffer<int> buf(levels * 4); + + int *cols_pyr = buf; + int *rows_pyr = cols_pyr + levels; + int *nr_plane_pyr = rows_pyr + levels; + int *step_pyr = nr_plane_pyr + levels; + + cols_pyr[0] = cols; + rows_pyr[0] = rows; + nr_plane_pyr[0] = rthis.nr_plane; + + const int n = 64; + step_pyr[0] = alignSize(cols * sizeof(T), n) / sizeof(T); + for (int i = 1; i < levels; i++) + { + cols_pyr[i] = cols_pyr[i - 1] / 2; + rows_pyr[i] = rows_pyr[i - 1]/ 2; + + nr_plane_pyr[i] = nr_plane_pyr[i - 1] * 2; + + step_pyr[i] = alignSize(cols_pyr[i] * sizeof(T), n) / sizeof(T); + } + + Size msg_size(step_pyr[0], rows * nr_plane_pyr[0]); + Size 
data_cost_size(step_pyr[0], rows * nr_plane_pyr[0] * 2); + + u[0].create(msg_size, DataType<T>::type); + d[0].create(msg_size, DataType<T>::type); + l[0].create(msg_size, DataType<T>::type); + r[0].create(msg_size, DataType<T>::type); + + u[1].create(msg_size, DataType<T>::type); + d[1].create(msg_size, DataType<T>::type); + l[1].create(msg_size, DataType<T>::type); + r[1].create(msg_size, DataType<T>::type); + + disp_selected_pyr[0].create(msg_size, DataType<T>::type); + disp_selected_pyr[1].create(msg_size, DataType<T>::type); + + data_cost.create(data_cost_size, DataType<T>::type); + data_cost_selected.create(msg_size, DataType<T>::type); + + Size temp_size = data_cost_size; + if (data_cost_size.width * data_cost_size.height < step_pyr[0] * rows_pyr[levels - 1] * rthis.ndisp) + temp_size = Size(step_pyr[0], rows_pyr[levels - 1] * rthis.ndisp); + + temp.create(temp_size, DataType<T>::type); + temp = zero; + + ///////////////////////////////// Compute//////////////////////////////////////////////// + + //csbp::load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, + // rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp); + + l[0] = zero; + d[0] = zero; + r[0] = zero; + u[0] = zero; + disp_selected_pyr[0] = zero; + + l[1] = zero; + d[1] = zero; + r[1] = zero; + u[1] = zero; + disp_selected_pyr[1] = zero; + + data_cost = zero; + + data_cost_selected = zero; + + int cur_idx = 0; + + for (int i = levels - 1; i >= 0; i--) + { + if (i == levels - 1) + { + cv::ocl::stereoCSBP::init_data_cost(left, right, temp, rthis, disp_selected_pyr[cur_idx].data, + data_cost_selected.data, step_pyr[0], rows_pyr[i], cols_pyr[i], + i, nr_plane_pyr[i]); + } + else + { + cv::ocl::stereoCSBP::compute_data_cost( + disp_selected_pyr[cur_idx].data, data_cost.data, rthis, step_pyr[0], + step_pyr[0], left, right, rows_pyr[i], cols_pyr[i], rows_pyr[i + 1], i, + nr_plane_pyr[i + 1]); + + int new_idx = (cur_idx + 1) & 1; + + cv::ocl::stereoCSBP::init_message(u[new_idx].data, d[new_idx].data, l[new_idx].data, r[new_idx].data, + u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data, + disp_selected_pyr[new_idx].data, disp_selected_pyr[cur_idx].data, + data_cost_selected.data, data_cost.data, temp, rthis, step_pyr[0], + step_pyr[0], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rows_pyr[i + 1], + cols_pyr[i + 1], nr_plane_pyr[i + 1]); + cur_idx = new_idx; + } + cv::ocl::stereoCSBP::calc_all_iterations(u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data, + data_cost_selected.data, disp_selected_pyr[cur_idx].data, temp, + rthis, step_pyr[0], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i]); + } + + if (disp.empty()) + disp.create(rows, cols, CV_16S); + + out = ((disp.type() == CV_16S) ? 
disp : (out.create(rows, cols, CV_16S), out)); + out = zero; + + stereoCSBP::compute_disp(u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data, + data_cost_selected.data, disp_selected_pyr[cur_idx].data, rthis, step_pyr[0], + out, nr_plane_pyr[0]); + if (disp.type() != CV_16S) + out.convertTo(disp, disp.type()); +} + + +typedef void (*csbp_operator_t)(StereoConstantSpaceBP &rthis, oclMat u[2], oclMat d[2], oclMat l[2], oclMat r[2], + oclMat disp_selected_pyr[2], oclMat &data_cost, oclMat &data_cost_selected, + oclMat &temp, oclMat &out, const oclMat &left, const oclMat &right, oclMat &disp); + +const static csbp_operator_t operators[] = {0, 0, 0, csbp_operator<short>, 0, csbp_operator<float>, 0, 0}; + +void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &left, const oclMat &right, oclMat &disp) +{ + + CV_Assert(msg_type == CV_32F || msg_type == CV_16S); + operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out, + left, right, disp); +} + +#endif /* !defined (HAVE_OPENCL) */ diff --git a/modules/ocl/src/stereobm.cpp b/modules/ocl/src/stereobm.cpp index 5eab75a65..a5cbe2b9f 100644 --- a/modules/ocl/src/stereobm.cpp +++ b/modules/ocl/src/stereobm.cpp @@ -72,28 +72,21 @@ namespace stereoBM //////////////////////////////////////////////////////////////////////// static void prefilter_xsobel(const oclMat &input, oclMat &output, int prefilterCap) { - Context *clCxt = input.clCxt; - String kernelName = "prefilter_xsobel"; - cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName); size_t blockSize = 1; size_t globalThreads[3] = { input.cols, input.rows, 1 }; size_t localThreads[3] = { blockSize, blockSize, 1 }; - openCLVerifyKernel(clCxt, kernel, localThreads); - openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&input.data)); - openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&output.data)); - openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&input.rows)); - openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&input.cols)); - openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&prefilterCap)); - - openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 3, NULL, - globalThreads, localThreads, 0, NULL, NULL)); - - clFinish((cl_command_queue)clCxt->oclCommandQueue()); - openCLSafeCall(clReleaseKernel(kernel)); + std::vector< std::pair<size_t, const void *> > args; + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.rows)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.cols)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&prefilterCap)); + openCLExecuteKernel(Context::getContext(), &stereobm, kernelName, + globalThreads, localThreads, args, -1, -1); } ////////////////////////////////////////////////////////////////////////// //////////////////////////////common//////////////////////////////////// @@ -113,16 +106,13 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp, { int winsz2 = winSize >> 1; - Context *clCxt = left.clCxt; - String kernelName = "stereoKernel"; - cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName); disp.setTo(Scalar_<unsigned char>::all(0)); minSSD_buf.setTo(Scalar_<unsigned int>::all(0xFFFFFFFF)); size_t minssd_step = minSSD_buf.step / minSSD_buf.elemSize(); - size_t local_mem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * 
winsz2)) * + size_t local_mem_size = (N_DISPARITIES * (BLOCK_W + 2 * winsz2)) * sizeof(cl_uint); //size_t blockSize = 1; size_t localThreads[] = { BLOCK_W, 1,1}; @@ -131,26 +121,23 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp, 1 }; - openCLVerifyKernel(clCxt, kernel, localThreads); - openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&left.data)); - openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&right.data)); - openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&minSSD_buf.data)); - openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&minssd_step)); - openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&disp.data)); - openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&disp.step)); - openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols)); - openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&left.rows)); - openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&left.step)); - openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&maxdisp)); - openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&winsz2)); - openCLSafeCall(clSetKernelArg(kernel, 11, local_mem_size, (void *)NULL)); + std::vector< std::pair > args; + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&right.data)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&minSSD_buf.data)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&minssd_step)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disp.data)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&disp.step)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.cols)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.rows)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.step)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&maxdisp)); + args.push_back(std::make_pair(local_mem_size, (void *)NULL)); - openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 2, NULL, - globalThreads, localThreads, 0, NULL, NULL)); - - - clFinish((cl_command_queue)clCxt->oclCommandQueue()); - openCLSafeCall(clReleaseKernel(kernel)); + char opt [128]; + sprintf(opt, "-D radius=%d", winsz2); + openCLExecuteKernel(Context::getContext(), &stereobm, kernelName, + globalThreads, localThreads, args, -1, -1, opt); } //////////////////////////////////////////////////////////////////////////// ///////////////////////////////postfilter_textureness/////////////////////// @@ -158,10 +145,7 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp, static void postfilter_textureness(oclMat &left, int winSize, float avergeTexThreshold, oclMat &disparity) { - Context *clCxt = left.clCxt; - String kernelName = "textureness_kernel"; - cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName); size_t blockSize = 1; size_t localThreads[] = { BLOCK_W, blockSize ,1}; @@ -172,22 +156,19 @@ static void postfilter_textureness(oclMat &left, int winSize, size_t local_mem_size = (localThreads[0] + localThreads[0] + (winSize / 2) * 2) * sizeof(float); - openCLVerifyKernel(clCxt, kernel, localThreads); - openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disparity.data)); - openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&disparity.rows)); - openCLSafeCall(clSetKernelArg(kernel, 
2, sizeof(cl_int), (void *)&disparity.cols)); - openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&disparity.step)); - openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&left.data)); - openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&left.rows)); - openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols)); - openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&winSize)); - openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&avergeTexThreshold)); - openCLSafeCall(clSetKernelArg(kernel, 9, local_mem_size, NULL)); - openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 2, NULL, - globalThreads, localThreads, 0, NULL, NULL)); - - clFinish((cl_command_queue)clCxt->oclCommandQueue()); - openCLSafeCall(clReleaseKernel(kernel)); + std::vector< std::pair > args; + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disparity.data)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.rows)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.cols)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.step)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.rows)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.cols)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&winSize)); + args.push_back(std::make_pair(sizeof(cl_float), (void *)&avergeTexThreshold)); + args.push_back(std::make_pair(local_mem_size, (void*)NULL)); + openCLExecuteKernel(Context::getContext(), &stereobm, kernelName, + globalThreads, localThreads, args, -1, -1); } ////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////operator///////////////////////////////// diff --git a/modules/ocl/src/tvl1flow.cpp b/modules/ocl/src/tvl1flow.cpp new file mode 100644 index 000000000..8362672d2 --- /dev/null +++ b/modules/ocl/src/tvl1flow.cpp @@ -0,0 +1,475 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Jin Ma, jin@multicorewareinc.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + + +#include "precomp.hpp" +using namespace std; +using namespace cv; +using namespace cv::ocl; + +namespace cv +{ + namespace ocl + { + ///////////////////////////OpenCL kernel strings/////////////////////////// + extern const char* tvl1flow; + } +} + +cv::ocl::OpticalFlowDual_TVL1_OCL::OpticalFlowDual_TVL1_OCL() +{ + tau = 0.25; + lambda = 0.15; + theta = 0.3; + nscales = 5; + warps = 5; + epsilon = 0.01; + iterations = 300; + useInitialFlow = false; +} + +void cv::ocl::OpticalFlowDual_TVL1_OCL::operator()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy) +{ + CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 ); + CV_Assert( I0.size() == I1.size() ); + CV_Assert( I0.type() == I1.type() ); + CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) ); + CV_Assert( nscales > 0 ); + + // allocate memory for the pyramid structure + I0s.resize(nscales); + I1s.resize(nscales); + u1s.resize(nscales); + u2s.resize(nscales); + //I0s_step == I1s_step + I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0); + I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 
1.0 : 255.0); + + + if (!useInitialFlow) + { + flowx.create(I0.size(), CV_32FC1); + flowy.create(I0.size(), CV_32FC1); + } + //u1s_step != u2s_step + u1s[0] = flowx; + u2s[0] = flowy; + + I1x_buf.create(I0.size(), CV_32FC1); + I1y_buf.create(I0.size(), CV_32FC1); + + I1w_buf.create(I0.size(), CV_32FC1); + I1wx_buf.create(I0.size(), CV_32FC1); + I1wy_buf.create(I0.size(), CV_32FC1); + + grad_buf.create(I0.size(), CV_32FC1); + rho_c_buf.create(I0.size(), CV_32FC1); + + p11_buf.create(I0.size(), CV_32FC1); + p12_buf.create(I0.size(), CV_32FC1); + p21_buf.create(I0.size(), CV_32FC1); + p22_buf.create(I0.size(), CV_32FC1); + + diff_buf.create(I0.size(), CV_32FC1); + + // create the scales + for (int s = 1; s < nscales; ++s) + { + ocl::pyrDown(I0s[s - 1], I0s[s]); + ocl::pyrDown(I1s[s - 1], I1s[s]); + + if (I0s[s].cols < 16 || I0s[s].rows < 16) + { + nscales = s; + break; + } + + if (useInitialFlow) + { + ocl::pyrDown(u1s[s - 1], u1s[s]); + ocl::pyrDown(u2s[s - 1], u2s[s]); + + //ocl::multiply(u1s[s], Scalar::all(0.5), u1s[s]); + multiply(0.5, u1s[s], u1s[s]); + //ocl::multiply(u2s[s], Scalar::all(0.5), u2s[s]); + multiply(0.5, u1s[s], u2s[s]); + } + } + + // pyramidal structure for computing the optical flow + for (int s = nscales - 1; s >= 0; --s) + { + // compute the optical flow at the current scale + procOneScale(I0s[s], I1s[s], u1s[s], u2s[s]); + + // if this was the last scale, finish now + if (s == 0) + break; + + // otherwise, upsample the optical flow + + // zoom the optical flow for the next finer scale + ocl::resize(u1s[s], u1s[s - 1], I0s[s - 1].size()); + ocl::resize(u2s[s], u2s[s - 1], I0s[s - 1].size()); + + // scale the optical flow with the appropriate zoom factor + multiply(2, u1s[s - 1], u1s[s - 1]); + multiply(2, u2s[s - 1], u2s[s - 1]); + + } + +} + +namespace ocl_tvl1flow +{ + void centeredGradient(const oclMat &src, oclMat &dx, oclMat &dy); + + void warpBackward(const oclMat &I0, const oclMat &I1, oclMat &I1x, oclMat &I1y, + oclMat &u1, oclMat &u2, oclMat &I1w, oclMat &I1wx, oclMat &I1wy, + oclMat &grad, oclMat &rho); + + void estimateU(oclMat &I1wx, oclMat &I1wy, oclMat &grad, + oclMat &rho_c, oclMat &p11, oclMat &p12, + oclMat &p21, oclMat &p22, oclMat &u1, + oclMat &u2, oclMat &error, float l_t, float theta); + + void estimateDualVariables(oclMat &u1, oclMat &u2, + oclMat &p11, oclMat &p12, oclMat &p21, oclMat &p22, float taut); +} + +void cv::ocl::OpticalFlowDual_TVL1_OCL::procOneScale(const oclMat &I0, const oclMat &I1, oclMat &u1, oclMat &u2) +{ + using namespace ocl_tvl1flow; + + const double scaledEpsilon = epsilon * epsilon * I0.size().area(); + + CV_DbgAssert( I1.size() == I0.size() ); + CV_DbgAssert( I1.type() == I0.type() ); + CV_DbgAssert( u1.empty() || u1.size() == I0.size() ); + CV_DbgAssert( u2.size() == u1.size() ); + + if (u1.empty()) + { + u1.create(I0.size(), CV_32FC1); + u1.setTo(Scalar::all(0)); + + u2.create(I0.size(), CV_32FC1); + u2.setTo(Scalar::all(0)); + } + + oclMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows)); + oclMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows)); + + centeredGradient(I1, I1x, I1y); + + oclMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows)); + oclMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows)); + oclMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows)); + + oclMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows)); + oclMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows)); + + oclMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows)); + oclMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows)); + oclMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows)); + 
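// Note on the convergence test used in the warping loop below: scaledEpsilon folds the image area
// into the threshold, so comparing the summed per-pixel error against it is equivalent to stopping
// once the mean squared flow update per pixel falls below epsilon^2 (assuming, as in the CPU and
// CUDA TV-L1 implementations, that the diff buffer holds the squared increment of (u1, u2) at each
// pixel). A hypothetical restatement of the same test, shown only for illustration:
//
//     bool converged(double errorSum, double epsilon, cv::Size sz)
//     {
//         return errorSum / sz.area() < epsilon * epsilon;   // same condition as errorSum < scaledEpsilon
//     }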
oclMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows)); + p11.setTo(Scalar::all(0)); + p12.setTo(Scalar::all(0)); + p21.setTo(Scalar::all(0)); + p22.setTo(Scalar::all(0)); + + oclMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows)); + + const float l_t = static_cast(lambda * theta); + const float taut = static_cast(tau / theta); + + for (int warpings = 0; warpings < warps; ++warpings) + { + warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c); + + double error = numeric_limits::max(); + for (int n = 0; error > scaledEpsilon && n < iterations; ++n) + { + estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, + u1, u2, diff, l_t, static_cast(theta)); + + error = ocl::sum(diff)[0]; + + estimateDualVariables(u1, u2, p11, p12, p21, p22, taut); + + } + } + +} + +void cv::ocl::OpticalFlowDual_TVL1_OCL::collectGarbage() +{ + I0s.clear(); + I1s.clear(); + u1s.clear(); + u2s.clear(); + + I1x_buf.release(); + I1y_buf.release(); + + I1w_buf.release(); + I1wx_buf.release(); + I1wy_buf.release(); + + grad_buf.release(); + rho_c_buf.release(); + + p11_buf.release(); + p12_buf.release(); + p21_buf.release(); + p22_buf.release(); + + diff_buf.release(); + norm_buf.release(); +} + +void ocl_tvl1flow::centeredGradient(const oclMat &src, oclMat &dx, oclMat &dy) +{ + Context *clCxt = src.clCxt; + size_t localThreads[3] = {32, 8, 1}; + size_t globalThreads[3] = {src.cols, src.rows, 1}; + + int srcElementSize = src.elemSize(); + int src_step = src.step/srcElementSize; + + int dElememntSize = dx.elemSize(); + int dx_step = dx.step/dElememntSize; + + String kernelName = "centeredGradientKernel"; + vector< pair > args; + args.push_back( make_pair( sizeof(cl_mem), (void*)&src.data)); + args.push_back( make_pair( sizeof(cl_int), (void*)&src.cols)); + args.push_back( make_pair( sizeof(cl_int), (void*)&src.rows)); + args.push_back( make_pair( sizeof(cl_int), (void*)&src_step)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&dx.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&dy.data)); + args.push_back( make_pair( sizeof(cl_int), (void*)&dx_step)); + openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThreads, localThreads, args, -1, -1); + +} + +void ocl_tvl1flow::estimateDualVariables(oclMat &u1, oclMat &u2, oclMat &p11, oclMat &p12, oclMat &p21, oclMat &p22, float taut) +{ + Context *clCxt = u1.clCxt; + + size_t localThread[] = {32, 8, 1}; + size_t globalThread[] = + { + u1.cols, + u1.rows, + 1 + }; + + int u1_element_size = u1.elemSize(); + int u1_step = u1.step/u1_element_size; + + int u2_element_size = u2.elemSize(); + int u2_step = u2.step/u2_element_size; + + int p11_element_size = p11.elemSize(); + int p11_step = p11.step/p11_element_size; + + int u1_offset_y = u1.offset/u1.step; + int u1_offset_x = u1.offset%u1.step; + u1_offset_x = u1_offset_x/u1.elemSize(); + + int u2_offset_y = u2.offset/u2.step; + int u2_offset_x = u2.offset%u2.step; + u2_offset_x = u2_offset_x/u2.elemSize(); + + String kernelName = "estimateDualVariablesKernel"; + vector< pair > args; + args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u1.cols)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u1.rows)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u1_step)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&p11.data)); + args.push_back( make_pair( sizeof(cl_int), (void*)&p11_step)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&p12.data)); + args.push_back( 
make_pair( sizeof(cl_mem), (void*)&p21.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&p22.data)); + args.push_back( make_pair( sizeof(cl_float), (void*)&taut)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u2_step)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y)); + + openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1); +} + +void ocl_tvl1flow::estimateU(oclMat &I1wx, oclMat &I1wy, oclMat &grad, + oclMat &rho_c, oclMat &p11, oclMat &p12, + oclMat &p21, oclMat &p22, oclMat &u1, + oclMat &u2, oclMat &error, float l_t, float theta) +{ + Context* clCxt = I1wx.clCxt; + + size_t localThread[] = {32, 8, 1}; + size_t globalThread[] = + { + I1wx.cols, + I1wx.rows, + 1 + }; + + int I1wx_element_size = I1wx.elemSize(); + int I1wx_step = I1wx.step/I1wx_element_size; + + int u1_element_size = u1.elemSize(); + int u1_step = u1.step/u1_element_size; + + int u2_element_size = u2.elemSize(); + int u2_step = u2.step/u2_element_size; + + int u1_offset_y = u1.offset/u1.step; + int u1_offset_x = u1.offset%u1.step; + u1_offset_x = u1_offset_x/u1.elemSize(); + + int u2_offset_y = u2.offset/u2.step; + int u2_offset_x = u2.offset%u2.step; + u2_offset_x = u2_offset_x/u2.elemSize(); + + String kernelName = "estimateUKernel"; + vector< pair > args; + args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wx.data)); + args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx.cols)); + args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx.rows)); + args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx_step)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wy.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&grad.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&rho_c.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&p11.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&p12.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&p21.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&p22.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u1_step)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&error.data)); + args.push_back( make_pair( sizeof(cl_float), (void*)&l_t)); + args.push_back( make_pair( sizeof(cl_float), (void*)&theta)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u2_step)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y)); + + openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1); +} + +void ocl_tvl1flow::warpBackward(const oclMat &I0, const oclMat &I1, oclMat &I1x, oclMat &I1y, oclMat &u1, oclMat &u2, oclMat &I1w, oclMat &I1wx, oclMat &I1wy, oclMat &grad, oclMat &rho) +{ + Context* clCxt = I0.clCxt; + const bool isImgSupported = support_image2d(clCxt); + + CV_Assert(isImgSupported); + + int u1ElementSize = u1.elemSize(); + int u1Step = u1.step/u1ElementSize; + + int u2ElementSize = u2.elemSize(); + int u2Step = u2.step/u2ElementSize; + + int I0ElementSize = 
I0.elemSize(); + int I0Step = I0.step/I0ElementSize; + + int I1w_element_size = I1w.elemSize(); + int I1w_step = I1w.step/I1w_element_size; + + int u1_offset_y = u1.offset/u1.step; + int u1_offset_x = u1.offset%u1.step; + u1_offset_x = u1_offset_x/u1.elemSize(); + + int u2_offset_y = u2.offset/u2.step; + int u2_offset_x = u2.offset%u2.step; + u2_offset_x = u2_offset_x/u2.elemSize(); + + size_t localThread[] = {32, 8, 1}; + size_t globalThread[] = + { + I0.cols, + I0.rows, + 1 + }; + + cl_mem I1_tex; + cl_mem I1x_tex; + cl_mem I1y_tex; + I1_tex = bindTexture(I1); + I1x_tex = bindTexture(I1x); + I1y_tex = bindTexture(I1y); + + String kernelName = "warpBackwardKernel"; + vector< pair > args; + args.push_back( make_pair( sizeof(cl_mem), (void*)&I0.data)); + args.push_back( make_pair( sizeof(cl_int), (void*)&I0Step)); + args.push_back( make_pair( sizeof(cl_int), (void*)&I0.cols)); + args.push_back( make_pair( sizeof(cl_int), (void*)&I0.rows)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&I1_tex)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&I1x_tex)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&I1y_tex)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u1Step)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&I1w.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wx.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wy.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&grad.data)); + args.push_back( make_pair( sizeof(cl_mem), (void*)&rho.data)); + args.push_back( make_pair( sizeof(cl_int), (void*)&I1w_step)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u2Step)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x)); + args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y)); + + openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1); +} diff --git a/modules/ocl/test/interpolation.hpp b/modules/ocl/test/interpolation.hpp deleted file mode 100644 index fb89e701d..000000000 --- a/modules/ocl/test/interpolation.hpp +++ /dev/null @@ -1,120 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// Intel License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000, Intel Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
-// -// * The name of Intel Corporation may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_TEST_INTERPOLATION_HPP__ -#define __OPENCV_TEST_INTERPOLATION_HPP__ - -template T readVal(const cv::Mat &src, int y, int x, int c, int border_type, cv::Scalar borderVal = cv::Scalar()) -{ - if (border_type == cv::BORDER_CONSTANT) - return (y >= 0 && y < src.rows && x >= 0 && x < src.cols) ? src.at(y, x * src.channels() + c) : cv::saturate_cast(borderVal.val[c]); - - return src.at(cv::borderInterpolate(y, src.rows, border_type), cv::borderInterpolate(x, src.cols, border_type) * src.channels() + c); -} - -template struct NearestInterpolator -{ - static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar()) - { - return readVal(src, cvFloor(y), cvFloor(x), c, border_type, borderVal); - } -}; - -template struct LinearInterpolator -{ - static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar()) - { - x -= 0.5f; - y -= 0.5f; - - int x1 = cvFloor(x); - int y1 = cvFloor(y); - int x2 = x1 + 1; - int y2 = y1 + 1; - - float res = 0; - - res += readVal(src, y1, x1, c, border_type, borderVal) * ((x2 - x) * (y2 - y)); - res += readVal(src, y1, x2, c, border_type, borderVal) * ((x - x1) * (y2 - y)); - res += readVal(src, y2, x1, c, border_type, borderVal) * ((x2 - x) * (y - y1)); - res += readVal(src, y2, x2, c, border_type, borderVal) * ((x - x1) * (y - y1)); - - return cv::saturate_cast(res); - } -}; - -template struct CubicInterpolator -{ - static float getValue(float p[4], float x) - { - return p[1] + 0.5 * x * (p[2] - p[0] + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + x * (3.0 * (p[1] - p[2]) + p[3] - p[0]))); - } - - static float getValue(float p[4][4], float x, float y) - { - float arr[4]; - - arr[0] = getValue(p[0], x); - arr[1] = getValue(p[1], x); - arr[2] = getValue(p[2], x); - arr[3] = getValue(p[3], x); - - return getValue(arr, y); - } - - static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar()) - { - int ix = cvRound(x); - int iy = cvRound(y); - - float vals[4][4] = - { - {readVal(src, iy - 2, ix - 2, c, border_type, borderVal), readVal(src, iy - 2, ix - 1, c, border_type, borderVal), readVal(src, iy - 2, ix, c, border_type, borderVal), readVal(src, iy - 2, ix + 1, c, border_type, borderVal)}, - {readVal(src, iy - 1, ix - 2, c, border_type, borderVal), readVal(src, iy - 1, ix - 1, c, border_type, borderVal), readVal(src, iy - 1, ix, c, border_type, borderVal), readVal(src, iy - 1, ix + 1, c, border_type, borderVal)}, - {readVal(src, iy , ix 
- 2, c, border_type, borderVal), readVal(src, iy , ix - 1, c, border_type, borderVal), readVal(src, iy , ix, c, border_type, borderVal), readVal(src, iy , ix + 1, c, border_type, borderVal)}, - {readVal(src, iy + 1, ix - 2, c, border_type, borderVal), readVal(src, iy + 1, ix - 1, c, border_type, borderVal), readVal(src, iy + 1, ix, c, border_type, borderVal), readVal(src, iy + 1, ix + 1, c, border_type, borderVal)}, - }; - - return cv::saturate_cast(getValue(vals, (x - ix + 2.0) / 4.0, (y - iy + 2.0) / 4.0)); - } -}; - -#endif // __OPENCV_TEST_INTERPOLATION_HPP__ diff --git a/modules/ocl/test/precomp.hpp b/modules/ocl/test/precomp.hpp index 56efdabaa..72d5089d6 100644 --- a/modules/ocl/test/precomp.hpp +++ b/modules/ocl/test/precomp.hpp @@ -68,9 +68,7 @@ #include "opencv2/ocl.hpp" #include "utility.hpp" -#include "interpolation.hpp" #include "opencv2/core/private.hpp" #endif - diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp index e46fdbddd..d12cef0b2 100644 --- a/modules/ocl/test/test_arithm.cpp +++ b/modules/ocl/test/test_arithm.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // @Authors @@ -21,6 +22,7 @@ // Jiang Liyuan,jlyuan001.good@163.com // Rock Li, Rock.Li@amd.com // Zailong Wu, bullet@yeah.net +// Yao Wang, bitwangyaoyao@gmail.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -87,14 +89,13 @@ PARAM_TEST_CASE(ArithmTestBase, MatType, bool) int maskx; int masky; - //src mat with roi cv::Mat mat1_roi; cv::Mat mat2_roi; cv::Mat mask_roi; cv::Mat dst_roi; cv::Mat dst1_roi; //bak - //std::vector oclinfo; + //ocl dst mat for testing cv::ocl::oclMat gdst_whole; cv::ocl::oclMat gdst1_whole; //bak @@ -125,10 +126,6 @@ PARAM_TEST_CASE(ArithmTestBase, MatType, bool) val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0)); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); } void random_roi() @@ -175,14 +172,22 @@ PARAM_TEST_CASE(ArithmTestBase, MatType, bool) gmask = mask_roi; //end } + void Near(double threshold = 0.) + { + EXPECT_MAT_NEAR(dst, Mat(gdst_whole), threshold); + } + + void Near1(double threshold = 0.) 
+ { + EXPECT_MAT_NEAR(dst1, Mat(gdst1_whole), threshold); + } + }; ////////////////////////////////lut///////////////////////////////////////////////// - struct Lut : ArithmTestBase {}; #define VARNAME(A) string(#A); - TEST_P(Lut, Mat) { @@ -203,20 +208,12 @@ TEST_P(Lut, Mat) cv::LUT(mat1_roi, mat2_roi, dst_roi); cv::ocl::LUT(gmat1, gmat2, gdst); - - cv::Mat cpu_dst; - gdst_whole.download (cpu_dst); - char s[1024]; - sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 0, s); + Near(0); } } - - ////////////////////////////////exp///////////////////////////////////////////////// - struct Exp : ArithmTestBase {}; TEST_P(Exp, Mat) @@ -227,20 +224,12 @@ TEST_P(Exp, Mat) cv::exp(mat1_roi, dst_roi); cv::ocl::exp(gmat1, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - - char s[1024]; - sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 2, s); - + Near(2); } } ////////////////////////////////log///////////////////////////////////////////////// - struct Log : ArithmTestBase {}; TEST_P(Log, Mat) @@ -249,24 +238,14 @@ TEST_P(Log, Mat) { random_roi(); - cv::log(mat1_roi, dst_roi); cv::ocl::log(gmat1, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char s[1024]; - sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 1, s); - + Near(1); } } - - ////////////////////////////////add///////////////////////////////////////////////// - struct Add : ArithmTestBase {}; TEST_P(Add, Mat) @@ -277,12 +256,7 @@ TEST_P(Add, Mat) cv::add(mat1_roi, mat2_roi, dst_roi); cv::ocl::add(gmat1, gmat2, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char s[1024]; - sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s); + Near(0); } } @@ -294,14 +268,10 @@ TEST_P(Add, Mat_Mask) cv::add(mat1_roi, mat2_roi, dst_roi, mask_roi); cv::ocl::add(gmat1, gmat2, gdst, gmask); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char s[1024]; - sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s); + Near(0); } } + TEST_P(Add, Scalar) { for(int j = 0; j < LOOP_TIMES; j++) @@ -310,12 +280,7 @@ TEST_P(Add, Scalar) cv::add(mat1_roi, val, dst_roi); cv::ocl::add(gmat1, val, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char s[1024]; - sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, s); + Near(1e-5); } } @@ -327,12 +292,7 @@ TEST_P(Add, Scalar_Mask) cv::add(mat1_roi, val, dst_roi, mask_roi); cv::ocl::add(gmat1, val, gdst, gmask); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char s[1024]; - sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, 
src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, s); + Near(1e-5); } } @@ -349,12 +309,7 @@ TEST_P(Sub, Mat) cv::subtract(mat1_roi, mat2_roi, dst_roi); cv::ocl::subtract(gmat1, gmat2, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char s[1024]; - sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s); + Near(0); } } @@ -366,14 +321,10 @@ TEST_P(Sub, Mat_Mask) cv::subtract(mat1_roi, mat2_roi, dst_roi, mask_roi); cv::ocl::subtract(gmat1, gmat2, gdst, gmask); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char s[1024]; - sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s); + Near(0); } } + TEST_P(Sub, Scalar) { for(int j = 0; j < LOOP_TIMES; j++) @@ -382,12 +333,7 @@ TEST_P(Sub, Scalar) cv::subtract(mat1_roi, val, dst_roi); cv::ocl::subtract(gmat1, val, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char s[1024]; - sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, s); + Near(1e-5); } } @@ -399,12 +345,7 @@ TEST_P(Sub, Scalar_Mask) cv::subtract(mat1_roi, val, dst_roi, mask_roi); cv::ocl::subtract(gmat1, val, gdst, gmask); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char s[1024]; - sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, s); + Near(1e-5); } } @@ -421,12 +362,7 @@ TEST_P(Mul, Mat) cv::multiply(mat1_roi, mat2_roi, dst_roi); cv::ocl::multiply(gmat1, gmat2, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char s[1024]; - sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s); + Near(0); } } @@ -441,12 +377,7 @@ TEST_P(Mul, Mat_Scalar) cv::multiply(mat1_roi, mat2_roi, dst_roi, s); cv::ocl::multiply(gmat1, gmat2, gdst, s); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 0.001, sss); + Near(.001); } } @@ -462,13 +393,7 @@ TEST_P(Div, Mat) cv::divide(mat1_roi, mat2_roi, dst_roi); cv::ocl::divide(gmat1, gmat2, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss); + Near(1); } } @@ -483,13 +408,7 @@ TEST_P(Div, Mat_Scalar) cv::divide(mat1_roi, mat2_roi, dst_roi, s); cv::ocl::divide(gmat1, gmat2, gdst, s); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, 
src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.001, sss); + Near(.001); } } @@ -504,13 +423,7 @@ TEST_P(Absdiff, Mat) cv::absdiff(mat1_roi, mat2_roi, dst_roi); cv::ocl::absdiff(gmat1, gmat2, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0, sss); + Near(0); } } @@ -522,13 +435,7 @@ TEST_P(Absdiff, Mat_Scalar) cv::absdiff(mat1_roi, val, dst_roi); cv::ocl::absdiff(gmat1, val, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); + Near(1e-5); } } @@ -544,16 +451,8 @@ TEST_P(CartToPolar, angleInDegree) cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1); cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 1); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - - cv::Mat cpu_dst1; - gdst1_whole.download(cpu_dst1); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 0.5, sss); - EXPECT_MAT_NEAR(dst1, cpu_dst1, 0.5, sss); + Near(.5); + Near1(.5); } } @@ -565,22 +464,12 @@ TEST_P(CartToPolar, angleInRadians) cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0); cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 0); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - - cv::Mat cpu_dst1; - gdst1_whole.download(cpu_dst1); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 0.5, sss); - EXPECT_MAT_NEAR(dst1, cpu_dst1, 0.5, sss); + Near(.5); + Near1(.5); } } - - struct PolarToCart : ArithmTestBase {}; TEST_P(PolarToCart, angleInDegree) @@ -591,17 +480,8 @@ TEST_P(PolarToCart, angleInDegree) cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1); cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 1); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - - cv::Mat cpu_dst1; - gdst1_whole.download(cpu_dst1); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.5, sss); - EXPECT_MAT_NEAR(dst1, cpu_dst1, 0.5, sss); + Near(.5); + Near1(.5); } } @@ -613,17 +493,8 @@ TEST_P(PolarToCart, angleInRadians) cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0); cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 0); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - - cv::Mat cpu_dst1; - gdst1_whole.download(cpu_dst1); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.5, sss); - EXPECT_MAT_NEAR(dst1, cpu_dst1, 0.5, sss); + Near(.5); + Near1(.5); } } @@ -640,19 +511,11 @@ TEST_P(Magnitude, Mat) cv::magnitude(mat1_roi, mat2_roi, dst_roi); 
cv::ocl::magnitude(gmat1, gmat2, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); + Near(1e-5); } } - - struct Transpose : ArithmTestBase {}; TEST_P(Transpose, Mat) @@ -663,20 +526,11 @@ TEST_P(Transpose, Mat) cv::transpose(mat1_roi, dst_roi); cv::ocl::transpose(gmat1, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); + Near(1e-5); } } - - - struct Flip : ArithmTestBase {}; TEST_P(Flip, X) @@ -687,13 +541,7 @@ TEST_P(Flip, X) cv::flip(mat1_roi, dst_roi, 0); cv::ocl::flip(gmat1, gdst, 0); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); + Near(1e-5); } } @@ -705,13 +553,7 @@ TEST_P(Flip, Y) cv::flip(mat1_roi, dst_roi, 1); cv::ocl::flip(gmat1, gdst, 1); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); + Near(1e-5); } } @@ -723,18 +565,11 @@ TEST_P(Flip, BOTH) cv::flip(mat1_roi, dst_roi, -1); cv::ocl::flip(gmat1, gdst, -1); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); + Near(1e-5); } } - struct MinMax : ArithmTestBase {}; TEST_P(MinMax, MAT) @@ -765,12 +600,8 @@ TEST_P(MinMax, MAT) double minVal_, maxVal_; cv::ocl::minMax(gmat1, &minVal_, &maxVal_); - //check results - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_DOUBLE_EQ(minVal_, minVal) << sss; - EXPECT_DOUBLE_EQ(maxVal_, maxVal) << sss; + EXPECT_DOUBLE_EQ(minVal_, minVal); + EXPECT_DOUBLE_EQ(maxVal_, maxVal); } } @@ -803,12 +634,8 @@ TEST_P(MinMax, MASK) double minVal_, maxVal_; cv::ocl::minMax(gmat1, &minVal_, &maxVal_, gmask); - //check results - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_DOUBLE_EQ(minVal_, minVal) << sss; - EXPECT_DOUBLE_EQ(maxVal_, maxVal) << sss; + EXPECT_DOUBLE_EQ(minVal_, minVal); + EXPECT_DOUBLE_EQ(maxVal_, maxVal); } } @@ -919,17 +746,13 @@ TEST_P(MinMaxLoc, MAT) error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); } - //check results - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); + 
EXPECT_DOUBLE_EQ(minVal_, minVal); + EXPECT_DOUBLE_EQ(maxVal_, maxVal); + EXPECT_DOUBLE_EQ(minlocVal_, minlocVal); + EXPECT_DOUBLE_EQ(maxlocVal_, maxlocVal); - EXPECT_DOUBLE_EQ(minVal_, minVal) << sss; - EXPECT_DOUBLE_EQ(maxVal_, maxVal) << sss; - EXPECT_DOUBLE_EQ(minlocVal_, minlocVal) << sss; - EXPECT_DOUBLE_EQ(maxlocVal_, maxlocVal) << sss; - - EXPECT_DOUBLE_EQ(error0, 0.0) << sss; - EXPECT_DOUBLE_EQ(error1, 0.0) << sss; + EXPECT_DOUBLE_EQ(error0, 0.0); + EXPECT_DOUBLE_EQ(error1, 0.0); } } @@ -1040,17 +863,13 @@ TEST_P(MinMaxLoc, MASK) error1 = ::abs(mat1_roi.at(maxLoc_) - mat1_roi.at(maxLoc)); } - //check results - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); + EXPECT_DOUBLE_EQ(minVal_, minVal); + EXPECT_DOUBLE_EQ(maxVal_, maxVal); + EXPECT_DOUBLE_EQ(minlocVal_, minlocVal); + EXPECT_DOUBLE_EQ(maxlocVal_, maxlocVal); - EXPECT_DOUBLE_EQ(minVal_, minVal) << sss; - EXPECT_DOUBLE_EQ(maxVal_, maxVal) << sss; - EXPECT_DOUBLE_EQ(minlocVal_, minlocVal) << sss; - EXPECT_DOUBLE_EQ(maxlocVal_, maxlocVal) << sss; - - EXPECT_DOUBLE_EQ(error0, 0.0) << sss; - EXPECT_DOUBLE_EQ(error1, 0.0) << sss; + EXPECT_DOUBLE_EQ(error0, 0.0); + EXPECT_DOUBLE_EQ(error1, 0.0); } } @@ -1064,14 +883,12 @@ TEST_P(Sum, MAT) random_roi(); Scalar cpures = cv::sum(mat1_roi); Scalar gpures = cv::ocl::sum(gmat1); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); //check results - EXPECT_NEAR(cpures[0], gpures[0], 0.1) << sss; - EXPECT_NEAR(cpures[1], gpures[1], 0.1) << sss; - EXPECT_NEAR(cpures[2], gpures[2], 0.1) << sss; - EXPECT_NEAR(cpures[3], gpures[3], 0.1) << sss; + EXPECT_NEAR(cpures[0], gpures[0], 0.1); + EXPECT_NEAR(cpures[1], gpures[1], 0.1); + EXPECT_NEAR(cpures[2], gpures[2], 0.1); + EXPECT_NEAR(cpures[3], gpures[3], 0.1); } } @@ -1086,11 +903,7 @@ TEST_P(CountNonZero, MAT) int cpures = cv::countNonZero(mat1_roi); int gpures = cv::ocl::countNonZero(gmat1); - //check results - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_DOUBLE_EQ((double)cpures, (double)gpures) << sss; + EXPECT_DOUBLE_EQ((double)cpures, (double)gpures); } } @@ -1112,13 +925,7 @@ TEST_P(Phase, Mat) random_roi(); cv::phase(mat1_roi, mat2_roi, dst_roi, angelInDegrees ? true : false); cv::ocl::phase(gmat1, gmat2, gdst, angelInDegrees ? 
true : false); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-2, sss); + Near(1e-2); } } } @@ -1135,13 +942,7 @@ TEST_P(Bitwise_and, Mat) cv::bitwise_and(mat1_roi, mat2_roi, dst_roi); cv::ocl::bitwise_and(gmat1, gmat2, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss); + Near(0); } } @@ -1153,15 +954,10 @@ TEST_P(Bitwise_and, Mat_Mask) cv::bitwise_and(mat1_roi, mat2_roi, dst_roi, mask_roi); cv::ocl::bitwise_and(gmat1, gmat2, gdst, gmask); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss); + Near(0); } } + TEST_P(Bitwise_and, Scalar) { for(int j = 0; j < LOOP_TIMES; j++) @@ -1170,14 +966,7 @@ TEST_P(Bitwise_and, Scalar) cv::bitwise_and(mat1_roi, val, dst_roi); cv::ocl::bitwise_and(gmat1, val, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); - + Near(1e-5); } } @@ -1189,14 +978,7 @@ TEST_P(Bitwise_and, Scalar_Mask) cv::bitwise_and(mat1_roi, val, dst_roi, mask_roi); cv::ocl::bitwise_and(gmat1, val, gdst, gmask); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char *sss = new char[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); - delete[] sss; + Near(1e-5); } } @@ -1214,13 +996,7 @@ TEST_P(Bitwise_or, Mat) cv::bitwise_or(mat1_roi, mat2_roi, dst_roi); cv::ocl::bitwise_or(gmat1, gmat2, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss); + Near(0); } } @@ -1232,15 +1008,10 @@ TEST_P(Bitwise_or, Mat_Mask) cv::bitwise_or(mat1_roi, mat2_roi, dst_roi, mask_roi); cv::ocl::bitwise_or(gmat1, gmat2, gdst, gmask); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss); + Near(0); } } + TEST_P(Bitwise_or, Scalar) { for(int j = 0; j < LOOP_TIMES; j++) @@ -1249,13 +1020,7 @@ TEST_P(Bitwise_or, Scalar) cv::bitwise_or(mat1_roi, val, dst_roi); cv::ocl::bitwise_or(gmat1, val, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, 
"roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); + Near(1e-5); } } @@ -1267,13 +1032,7 @@ TEST_P(Bitwise_or, Scalar_Mask) cv::bitwise_or(mat1_roi, val, dst_roi, mask_roi); cv::ocl::bitwise_or(gmat1, val, gdst, gmask); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); + Near(1e-5); } } @@ -1291,13 +1050,7 @@ TEST_P(Bitwise_xor, Mat) cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi); cv::ocl::bitwise_xor(gmat1, gmat2, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss); + Near(0); } } @@ -1309,15 +1062,10 @@ TEST_P(Bitwise_xor, Mat_Mask) cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi, mask_roi); cv::ocl::bitwise_xor(gmat1, gmat2, gdst, gmask); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss); + Near(0); } } + TEST_P(Bitwise_xor, Scalar) { for(int j = 0; j < LOOP_TIMES; j++) @@ -1326,13 +1074,7 @@ TEST_P(Bitwise_xor, Scalar) cv::bitwise_xor(mat1_roi, val, dst_roi); cv::ocl::bitwise_xor(gmat1, val, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); + Near(1e-5); } } @@ -1344,13 +1086,7 @@ TEST_P(Bitwise_xor, Scalar_Mask) cv::bitwise_xor(mat1_roi, val, dst_roi, mask_roi); cv::ocl::bitwise_xor(gmat1, val, gdst, gmask); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); + Near(1e-5); } } @@ -1367,13 +1103,7 @@ TEST_P(Bitwise_not, Mat) cv::bitwise_not(mat1_roi, dst_roi); cv::ocl::bitwise_not(gmat1, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss); + Near(0); } } @@ -1390,7 +1120,7 @@ TEST_P(Compare, Mat) } int cmp_codes[] = {CMP_EQ, CMP_GT, CMP_GE, CMP_LT, CMP_LE, CMP_NE}; - const char *cmp_str[] = {"CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE"}; + //const char *cmp_str[] = {"CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE"}; int cmp_num = sizeof(cmp_codes) / sizeof(int); for (int i = 0; i < cmp_num; ++i) @@ -1402,13 +1132,7 @@ TEST_P(Compare, Mat) cv::compare(mat1_roi, mat2_roi, dst_roi, cmp_codes[i]); cv::ocl::compare(gmat1, gmat2, 
gdst, cmp_codes[i]); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "cmptype=%s, roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", cmp_str[i], roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss); + Near(0); } } @@ -1430,14 +1154,7 @@ TEST_P(Pow, Mat) double p = 4.5; cv::pow(mat1_roi, p, dst_roi); cv::ocl::pow(gmat1, p, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss); + Near(1); } } @@ -1448,36 +1165,18 @@ TEST_P(MagnitudeSqr, Mat) { for(int j = 0; j < LOOP_TIMES; j++) { - // random_roi(); - // int64 start, end; - // start = cv::getTickCount(); + random_roi(); for(int i = 0; i < mat1.rows; ++i) for(int j = 0; j < mat1.cols; ++j) { float val1 = mat1.at(i, j); float val2 = mat2.at(i, j); - ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2; - - // float val1 =((float *)( mat1.data))[(i*mat1.step/8 +j)*2]; - // - // float val2 =((float *)( mat1.data))[(i*mat1.step/8 +j)*2+ 1 ]; - - // ((float *)(dst.data))[i*dst.step/4 +j]= val1 * val1 +val2 * val2; } - // end = cv::getTickCount(); - - - cv::ocl::oclMat clmat1(mat1), clmat2(mat2), cldst; - cv::ocl::magnitudeSqr(clmat1, clmat2, cldst); - - cv::Mat cpu_dst; - cldst.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss); + cv::ocl::oclMat clmat1(mat1), clmat2(mat2); + cv::ocl::magnitudeSqr(clmat1, clmat2, gdst); + Near(1); } } @@ -1498,21 +1197,13 @@ TEST_P(AddWeighted, Mat) cv::ocl::addWeighted(gmat1, alpha, gmat2, beta, gama, gdst); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); + Near(1e-5); } } - //********test**************** INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine( diff --git a/modules/ocl/test/test_blend.cpp b/modules/ocl/test/test_blend.cpp index f9c8657d0..fa1aea172 100644 --- a/modules/ocl/test/test_blend.cpp +++ b/modules/ocl/test/test_blend.cpp @@ -1,3 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// @Authors +// Nathan, liujun@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ #include "precomp.hpp" #include @@ -33,20 +77,14 @@ void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &we PARAM_TEST_CASE(Blend, cv::Size, MatType/*, UseRoi*/) { - //std::vector oclinfo; cv::Size size; int type; bool useRoi; virtual void SetUp() { - //devInfo = GET_PARAM(0); size = GET_PARAM(0); type = GET_PARAM(1); - /*useRoi = GET_PARAM(3);*/ - - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); } }; @@ -59,12 +97,9 @@ TEST_P(Blend, Accuracy) cv::Mat weights1 = randomMat(size, CV_32F, 0, 1); cv::Mat weights2 = randomMat(size, CV_32F, 0, 1); - cv::ocl::oclMat gimg1(size, type), gimg2(size, type), gweights1(size, CV_32F), gweights2(size, CV_32F); - cv::ocl::oclMat dst(size, type); - gimg1.upload(img1); - gimg2.upload(img2); - gweights1.upload(weights1); - gweights2.upload(weights2); + cv::ocl::oclMat gimg1(img1), gimg2(img2), gweights1(weights1), gweights2(weights2); + cv::ocl::oclMat dst; + cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, dst); cv::Mat result; cv::Mat result_gold; @@ -74,10 +109,10 @@ TEST_P(Blend, Accuracy) else blendLinearGold(img1, img2, weights1, weights2, result_gold); - EXPECT_MAT_NEAR(result_gold, result, CV_MAT_DEPTH(type) == CV_8U ? 1.f : 1e-5f, 0); + EXPECT_MAT_NEAR(result_gold, result, CV_MAT_DEPTH(type) == CV_8U ? 1.f : 1e-5f); } -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, Combine( +INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Blend, Combine( DIFFERENT_SIZES, testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)) )); diff --git a/modules/ocl/test/test_brute_force_matcher.cpp b/modules/ocl/test/test_brute_force_matcher.cpp index 1fbbd99fe..84e5d4c77 100644 --- a/modules/ocl/test/test_brute_force_matcher.cpp +++ b/modules/ocl/test/test_brute_force_matcher.cpp @@ -7,12 +7,16 @@ // copy or use the software. 
// // -// Intel License Agreement +// License Agreement // For Open Source Computer Vision Library // -// Copyright (C) 2010-2012, Multicoreware inc., all rights reserved. +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. // Third party copyrights are property of their respective owners. // +// @Authors +// Nathan, liujun@multicorewareinc.com +// // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // @@ -21,12 +25,12 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. +// and/or other oclMaterials provided with the distribution. // -// * The name of Intel Corporation may not be used to endorse or promote products +// * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. // -// This software is provided by the copyright holders and contributors "as is" and +// This software is provided by the copyright holders and contributors as is and // any express or implied warranties, including, but not limited to, the implied // warranties of merchantability and fitness for a particular purpose are disclaimed. // In no event shall the Intel Corporation or contributors be liable for any direct, diff --git a/modules/ocl/test/test_calib3d.cpp b/modules/ocl/test/test_calib3d.cpp index b556e5a3c..14fb31f53 100644 --- a/modules/ocl/test/test_calib3d.cpp +++ b/modules/ocl/test/test_calib3d.cpp @@ -129,9 +129,69 @@ TEST_P(StereoMatchBP, Regression) bp(d_left, d_right, d_disp); d_disp.download(disp); disp.convertTo(disp, disp_gold.depth()); - EXPECT_MAT_NEAR(disp_gold, disp, 0.0, ""); + EXPECT_MAT_NEAR(disp_gold, disp, 0.0); } INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchBP, testing::Combine(testing::Values(64), testing::Values(8),testing::Values(2),testing::Values(25.0f), testing::Values(0.1f),testing::Values(15.0f),testing::Values(1.0f))); + +////////////////////////////////////////////////////////////////////////// +// ConstSpaceBeliefPropagation +PARAM_TEST_CASE(StereoMatchConstSpaceBP, int, int, int, int, float, float, float, float, int, int) +{ + int ndisp_; + int iters_; + int levels_; + int nr_plane_; + float max_data_term_; + float data_weight_; + float max_disc_term_; + float disc_single_jump_; + int min_disp_th_; + int msg_type_; + + virtual void SetUp() + { + ndisp_ = GET_PARAM(0); + iters_ = GET_PARAM(1); + levels_ = GET_PARAM(2); + nr_plane_ = GET_PARAM(3); + max_data_term_ = GET_PARAM(4); + data_weight_ = GET_PARAM(5); + max_disc_term_ = GET_PARAM(6); + disc_single_jump_ = GET_PARAM(7); + min_disp_th_ = GET_PARAM(8); + msg_type_ = GET_PARAM(9); + } +}; +TEST_P(StereoMatchConstSpaceBP, Regression) +{ + Mat left_image = readImage("csstereobp/aloe-L.png"); + Mat right_image = readImage("csstereobp/aloe-R.png"); + Mat disp_gold = readImage("csstereobp/aloe-disp.png", IMREAD_GRAYSCALE); + + ocl::oclMat d_left, d_right; + ocl::oclMat d_disp; + + Mat disp; + ASSERT_FALSE(left_image.empty()); + ASSERT_FALSE(right_image.empty()); + ASSERT_FALSE(disp_gold.empty()); + + d_left.upload(left_image); + d_right.upload(right_image); + + ocl::StereoConstantSpaceBP bp(ndisp_, iters_, levels_, nr_plane_, max_data_term_, data_weight_, + 
max_disc_term_, disc_single_jump_, 0, CV_32F); + bp(d_left, d_right, d_disp); + d_disp.download(disp); + disp.convertTo(disp, disp_gold.depth()); + + EXPECT_MAT_SIMILAR(disp_gold, disp, 1e-4); + //EXPECT_MAT_NEAR(disp_gold, disp, 1.0, ""); +} +INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchConstSpaceBP, testing::Combine(testing::Values(128), + testing::Values(16),testing::Values(4), testing::Values(4), testing::Values(30.0f), + testing::Values(1.0f),testing::Values(160.0f), + testing::Values(10.0f), testing::Values(0), testing::Values(CV_32F))); #endif // HAVE_OPENCL diff --git a/modules/ocl/test/test_color.cpp b/modules/ocl/test/test_color.cpp index d70535dca..efc96de23 100644 --- a/modules/ocl/test/test_color.cpp +++ b/modules/ocl/test/test_color.cpp @@ -103,7 +103,7 @@ PARAM_TEST_CASE(CvtColor, cv::Size, MatDepth) cv::cvtColor(src, dst_gold, CVTCODE(name));\ cv::Mat dst_mat;\ dst.download(dst_mat);\ - EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5, "");\ + EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5);\ } //add new ones here using macro @@ -144,7 +144,7 @@ TEST_P(CvtColor_Gray2RGB, Accuracy) cv::cvtColor(src, dst_gold, code); cv::Mat dst_mat; dst.download(dst_mat); - EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5, ""); + EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5); } @@ -174,7 +174,7 @@ TEST_P(CvtColor_YUV420, Accuracy) cv::Mat dst_mat; dst.download(dst_mat); MAT_DIFF(dst_mat, dst_gold); - EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5, ""); + EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5); } INSTANTIATE_TEST_CASE_P(OCL_ImgProc, CvtColor, testing::Combine( diff --git a/modules/ocl/test/test_columnsum.cpp b/modules/ocl/test/test_columnsum.cpp index 9bd2e6f8e..231f0657b 100644 --- a/modules/ocl/test/test_columnsum.cpp +++ b/modules/ocl/test/test_columnsum.cpp @@ -47,27 +47,16 @@ #include "precomp.hpp" #include -/////////////////////////////////////////////////////////////////////////////// -/// ColumnSum - #ifdef HAVE_OPENCL -//////////////////////////////////////////////////////////////////////// -// ColumnSum - -PARAM_TEST_CASE(ColumnSum, cv::Size, bool ) +PARAM_TEST_CASE(ColumnSum, cv::Size) { cv::Size size; cv::Mat src; - bool useRoi; - //std::vector oclinfo; virtual void SetUp() { size = GET_PARAM(0); - useRoi = GET_PARAM(1); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); } }; @@ -99,8 +88,7 @@ TEST_P(ColumnSum, Accuracy) } } -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ColumnSum, testing::Combine( - DIFFERENT_SIZES, testing::Values(Inverse(false), Inverse(true)))); +INSTANTIATE_TEST_CASE_P(OCL_ImgProc, ColumnSum, DIFFERENT_SIZES); #endif diff --git a/modules/ocl/test/test_fft.cpp b/modules/ocl/test/test_fft.cpp index 030ea1ff1..3d80bc814 100644 --- a/modules/ocl/test/test_fft.cpp +++ b/modules/ocl/test/test_fft.cpp @@ -68,7 +68,7 @@ TEST_P(Dft, C2C) cv::dft(a, b_gold, dft_flags); cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), dft_flags); - EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), a.size().area() * 1e-4, ""); + EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), a.size().area() * 1e-4); } TEST_P(Dft, R2C) @@ -81,11 +81,11 @@ TEST_P(Dft, R2C) cv::dft(a, b_gold, cv::DFT_COMPLEX_OUTPUT | dft_flags); b_gold_roi = b_gold(cv::Rect(0, 0, d_b.cols, d_b.rows)); - EXPECT_MAT_NEAR(b_gold_roi, cv::Mat(d_b), a.size().area() * 1e-4, ""); + EXPECT_MAT_NEAR(b_gold_roi, cv::Mat(d_b), a.size().area() * 1e-4); cv::Mat c_gold; cv::dft(b_gold, c_gold, cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE); - EXPECT_MAT_NEAR(b_gold_roi, cv::Mat(d_b), a.size().area() * 1e-4, ""); + EXPECT_MAT_NEAR(b_gold_roi, 
cv::Mat(d_b), a.size().area() * 1e-4); } TEST_P(Dft, R2CthenC2R) @@ -95,7 +95,7 @@ TEST_P(Dft, R2CthenC2R) cv::ocl::oclMat d_b, d_c; cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), 0); cv::ocl::dft(d_b, d_c, a.size(), cv::DFT_SCALE | cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT); - EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4, ""); + EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4); } diff --git a/modules/ocl/test/test_filters.cpp b/modules/ocl/test/test_filters.cpp index 70d407944..cfd57413e 100644 --- a/modules/ocl/test/test_filters.cpp +++ b/modules/ocl/test/test_filters.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // @Authors @@ -19,6 +20,7 @@ // Jia Haipeng, jiahaipeng95@gmail.com // Zero Lin, Zero.Lin@amd.com // Zhang Ying, zhangying913@gmail.com +// Yao Wang, bitwangyaoyao@gmail.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -55,121 +57,13 @@ using namespace testing; using namespace std; -PARAM_TEST_CASE(FilterTestBase, MatType, bool) +PARAM_TEST_CASE(FilterTestBase, + MatType, + cv::Size, // kernel size + cv::Size, // dx,dy + int // border type, or iteration + ) { - int type; - cv::Scalar val; - - //src mat - cv::Mat mat1; - cv::Mat mat2; - cv::Mat mask; - cv::Mat dst; - cv::Mat dst1; //bak, for two outputs - - // set up roi - int roicols; - int roirows; - int src1x; - int src1y; - int src2x; - int src2y; - int dstx; - int dsty; - int maskx; - int masky; - - //src mat with roi - cv::Mat mat1_roi; - cv::Mat mat2_roi; - cv::Mat mask_roi; - cv::Mat dst_roi; - cv::Mat dst1_roi; //bak - //std::vector oclinfo; - //ocl dst mat for testing - cv::ocl::oclMat gdst_whole; - cv::ocl::oclMat gdst1_whole; //bak - - //ocl mat with roi - cv::ocl::oclMat gmat1; - cv::ocl::oclMat gmat2; - cv::ocl::oclMat gdst; - cv::ocl::oclMat gdst1; //bak - cv::ocl::oclMat gmask; - - virtual void SetUp() - { - type = GET_PARAM(0); - - cv::RNG &rng = TS::ptr()->get_rng(); - cv::Size size(MWIDTH, MHEIGHT); - - mat1 = randomMat(rng, size, type, 5, 16, false); - mat2 = randomMat(rng, size, type, 5, 16, false); - dst = randomMat(rng, size, type, 5, 16, false); - dst1 = randomMat(rng, size, type, 5, 16, false); - mask = randomMat(rng, size, CV_8UC1, 0, 2, false); - - cv::threshold(mask, mask, 0.5, 255., CV_8UC1); - - val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0)); - } - - void random_roi() - { -#ifdef RANDOMROI - //randomize ROI - cv::RNG &rng = TS::ptr()->get_rng(); - roicols = rng.uniform(1, mat1.cols); - roirows = rng.uniform(1, mat1.rows); - src1x = rng.uniform(0, mat1.cols - roicols); - src1y = rng.uniform(0, mat1.rows - roirows); - src2x = rng.uniform(0, mat2.cols - roicols); - src2y = rng.uniform(0, mat2.rows - roirows); - dstx = rng.uniform(0, dst.cols - roicols); - dsty = rng.uniform(0, dst.rows - roirows); - maskx = rng.uniform(0, mask.cols - roicols); - masky = rng.uniform(0, mask.rows - roirows); -#else - roicols = mat1.cols; - roirows = mat1.rows; - src1x = 0; - src1y = 0; - src2x = 0; - src2y = 0; - dstx = 0; - dsty = 0; - maskx = 0; - masky = 0; -#endif - mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows)); - mat2_roi = 
mat2(Rect(src2x, src2y, roicols, roirows)); - mask_roi = mask(Rect(maskx, masky, roicols, roirows)); - dst_roi = dst(Rect(dstx, dsty, roicols, roirows)); - dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows)); - - gdst_whole = dst; - gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows)); - - gdst1_whole = dst1; - gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows)); - - gmat1 = mat1_roi; - gmat2 = mat2_roi; - gmask = mask_roi; - } - -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// -// blur - -PARAM_TEST_CASE(Blur, MatType, cv::Size, int) -{ - int type; - cv::Size ksize; - int bordertype; - //src mat cv::Mat mat1; cv::Mat dst; @@ -185,7 +79,7 @@ PARAM_TEST_CASE(Blur, MatType, cv::Size, int) //src mat with roi cv::Mat mat1_roi; cv::Mat dst_roi; - //std::vector oclinfo; + //ocl dst mat for testing cv::ocl::oclMat gdst_whole; @@ -193,23 +87,6 @@ PARAM_TEST_CASE(Blur, MatType, cv::Size, int) cv::ocl::oclMat gmat1; cv::ocl::oclMat gdst; - virtual void SetUp() - { - type = GET_PARAM(0); - ksize = GET_PARAM(1); - bordertype = GET_PARAM(2); - - cv::RNG &rng = TS::ptr()->get_rng(); - cv::Size size(MWIDTH, MHEIGHT); - - mat1 = randomMat(rng, size, type, 5, 16, false); - dst = randomMat(rng, size, type, 5, 16, false); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); - } - void random_roi() { #ifdef RANDOMROI @@ -236,10 +113,37 @@ PARAM_TEST_CASE(Blur, MatType, cv::Size, int) gdst_whole = dst; gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows)); - gmat1 = mat1_roi; } + void Init(int mat_type) + { + cv::Size size(MWIDTH, MHEIGHT); + mat1 = randomMat(size, mat_type, 5, 16); + dst = randomMat(size, mat_type, 5, 16); + } + + void Near(double threshold) + { + EXPECT_MAT_NEAR(dst, Mat(gdst_whole), threshold); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// blur +struct Blur : FilterTestBase +{ + int type; + cv::Size ksize; + int bordertype; + + virtual void SetUp() + { + type = GET_PARAM(0); + ksize = GET_PARAM(1); + bordertype = GET_PARAM(3); + Init(type); + } }; TEST_P(Blur, Mat) @@ -247,116 +151,36 @@ TEST_P(Blur, Mat) for(int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::blur(mat1_roi, dst_roi, ksize, Point(-1, -1), bordertype); cv::ocl::blur(gmat1, gdst, ksize, Point(-1, -1), bordertype); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty); - EXPECT_MAT_NEAR(dst, cpu_dst, 1.0, sss); + Near(1.0); } - } - ///////////////////////////////////////////////////////////////////////////////////////////////// //Laplacian - -PARAM_TEST_CASE(LaplacianTestBase, MatType, int) +struct Laplacian : FilterTestBase { int type; - int ksize; - - //src mat - cv::Mat mat; - cv::Mat dst; - - // set up roi - int roicols; - int roirows; - int srcx; - int srcy; - int dstx; - int dsty; - - //src mat with roi - cv::Mat mat_roi; - cv::Mat dst_roi; - //std::vector oclinfo; - //ocl dst mat for testing - cv::ocl::oclMat gdst_whole; - - //ocl mat with roi - cv::ocl::oclMat gmat; - cv::ocl::oclMat gdst; + cv::Size ksize; virtual void SetUp() { type = GET_PARAM(0); ksize = GET_PARAM(1); - - cv::RNG &rng = TS::ptr()->get_rng(); - cv::Size size(MWIDTH, MHEIGHT); - - mat = randomMat(rng, size, type, 5, 16, false); - dst = randomMat(rng, size, type, 5, 
16, false); - - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); - } - - void random_roi() - { -#ifdef RANDOMROI - //randomize ROI - cv::RNG &rng = TS::ptr()->get_rng(); - roicols = rng.uniform(2, mat.cols); - roirows = rng.uniform(2, mat.rows); - srcx = rng.uniform(0, mat.cols - roicols); - srcy = rng.uniform(0, mat.rows - roirows); - dstx = rng.uniform(0, dst.cols - roicols); - dsty = rng.uniform(0, dst.rows - roirows); -#else - roicols = mat.cols; - roirows = mat.rows; - srcx = 0; - srcy = 0; - dstx = 0; - dsty = 0; -#endif - - mat_roi = mat(Rect(srcx, srcy, roicols, roirows)); - dst_roi = dst(Rect(dstx, dsty, roicols, roirows)); - - gdst_whole = dst; - gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows)); - - gmat = mat_roi; + Init(type); } }; -struct Laplacian : LaplacianTestBase {}; - TEST_P(Laplacian, Accuracy) { for(int j = 0; j < LOOP_TIMES; j++) { random_roi(); - - cv::Laplacian(mat_roi, dst_roi, -1, ksize, 1); - cv::ocl::Laplacian(gmat, gdst, -1, ksize, 1); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, srcx, srcy, dstx, dsty); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); + cv::Laplacian(mat1_roi, dst_roi, -1, ksize.width, 1); + cv::ocl::Laplacian(gmat1, gdst, -1, ksize.width, 1); + Near(1e-5); } } @@ -364,8 +188,7 @@ TEST_P(Laplacian, Accuracy) ///////////////////////////////////////////////////////////////////////////////////////////////// // erode & dilate - -PARAM_TEST_CASE(ErodeDilateBase, MatType, int) +struct ErodeDilate : FilterTestBase { int type; int iterations; @@ -373,210 +196,54 @@ PARAM_TEST_CASE(ErodeDilateBase, MatType, int) //erode or dilate kernel cv::Mat kernel; - //src mat - cv::Mat mat1; - cv::Mat dst; - - // set up roi - int roicols; - int roirows; - int src1x; - int src1y; - int dstx; - int dsty; - - //src mat with roi - cv::Mat mat1_roi; - cv::Mat dst_roi; - //std::vector oclinfo; - //ocl dst mat for testing - cv::ocl::oclMat gdst_whole; - - //ocl mat with roi - cv::ocl::oclMat gmat1; - cv::ocl::oclMat gdst; - virtual void SetUp() { type = GET_PARAM(0); - iterations = GET_PARAM(1); - - cv::RNG &rng = TS::ptr()->get_rng(); - cv::Size size(MWIDTH, MHEIGHT); - - mat1 = randomMat(rng, size, type, 5, 16, false); - dst = randomMat(rng, size, type, 5, 16, false); + iterations = GET_PARAM(3); + Init(type); // rng.fill(kernel, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(3)); - kernel = randomMat(rng, Size(3, 3), CV_8UC1, 0, 3, false); - - } - - void random_roi() - { -#ifdef RANDOMROI - //randomize ROI - cv::RNG &rng = TS::ptr()->get_rng(); - roicols = rng.uniform(2, mat1.cols); - roirows = rng.uniform(2, mat1.rows); - src1x = rng.uniform(0, mat1.cols - roicols); - src1y = rng.uniform(0, mat1.rows - roirows); - dstx = rng.uniform(0, dst.cols - roicols); - dsty = rng.uniform(0, dst.rows - roirows); -#else - roicols = mat1.cols; - roirows = mat1.rows; - src1x = 0; - src1y = 0; - dstx = 0; - dsty = 0; -#endif - - mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows)); - dst_roi = dst(Rect(dstx, dsty, roicols, roirows)); - - gdst_whole = dst; - gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows)); - - - gmat1 = mat1_roi; + kernel = randomMat(Size(3, 3), CV_8UC1, 0, 3); } }; -// erode - -struct Erode : ErodeDilateBase {}; - -TEST_P(Erode, Mat) -{ - for(int j = 0; j < LOOP_TIMES; j++) - { - random_roi(); - - 
cv::erode(mat1_roi, dst_roi, kernel, Point(-1, -1), iterations); - cv::ocl::erode(gmat1, gdst, kernel, Point(-1, -1), iterations); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); - } - -} - - - - - -// dilate - -struct Dilate : ErodeDilateBase {}; - -TEST_P(Dilate, Mat) +TEST_P(ErodeDilate, Mat) { for(int j = 0; j < LOOP_TIMES; j++) { random_roi(); cv::erode(mat1_roi, dst_roi, kernel, Point(-1, -1), iterations); cv::ocl::erode(gmat1, gdst, kernel, Point(-1, -1), iterations); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss); + Near(1e-5); + } + for(int j = 0; j < LOOP_TIMES; j++) + { + random_roi(); + cv::dilate(mat1_roi, dst_roi, kernel, Point(-1, -1), iterations); + cv::ocl::dilate(gmat1, gdst, kernel, Point(-1, -1), iterations); + Near(1e-5); } - } - - ///////////////////////////////////////////////////////////////////////////////////////////////// // Sobel - -PARAM_TEST_CASE(Sobel, MatType, int, int, int, int) +struct Sobel : FilterTestBase { int type; int dx, dy, ksize, bordertype; - //src mat - cv::Mat mat1; - cv::Mat dst; - - // set up roi - int roicols; - int roirows; - int src1x; - int src1y; - int dstx; - int dsty; - - //src mat with roi - cv::Mat mat1_roi; - cv::Mat dst_roi; - //std::vector oclinfo; - //ocl dst mat for testing - cv::ocl::oclMat gdst_whole; - - //ocl mat with roi - cv::ocl::oclMat gmat1; - cv::ocl::oclMat gdst; - virtual void SetUp() { type = GET_PARAM(0); - dx = GET_PARAM(1); - dy = GET_PARAM(2); - ksize = GET_PARAM(3); - bordertype = GET_PARAM(4); - - - cv::RNG &rng = TS::ptr()->get_rng(); - cv::Size size(MWIDTH, MHEIGHT); - - mat1 = randomMat(rng, size, type, 5, 16, false); - dst = randomMat(rng, size, type, 5, 16, false); - - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); + Size s = GET_PARAM(1); + ksize = s.width; + s = GET_PARAM(2); + dx = s.width; + dy = s.height; + bordertype = GET_PARAM(3); + Init(type); } - - void random_roi() - { -#ifdef RANDOMROI - //randomize ROI - cv::RNG &rng = TS::ptr()->get_rng(); - roicols = rng.uniform(2, mat1.cols); - roirows = rng.uniform(2, mat1.rows); - src1x = rng.uniform(0, mat1.cols - roicols); - src1y = rng.uniform(0, mat1.rows - roirows); - dstx = rng.uniform(0, dst.cols - roicols); - dsty = rng.uniform(0, dst.rows - roirows); -#else - roicols = mat1.cols; - roirows = mat1.rows; - src1x = 0; - src1y = 0; - dstx = 0; - dsty = 0; -#endif - - mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows)); - dst_roi = dst(Rect(dstx, dsty, roicols, roirows)); - - gdst_whole = dst; - gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows)); - - - gmat1 = mat1_roi; - } - }; TEST_P(Sobel, Mat) @@ -584,103 +251,29 @@ TEST_P(Sobel, Mat) for(int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::Sobel(mat1_roi, dst_roi, -1, dx, dy, ksize, /*scale*/0.00001,/*delta*/0, bordertype); cv::ocl::Sobel(gmat1, gdst, -1, dx, dy, ksize,/*scale*/0.00001,/*delta*/0, bordertype); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, 
dsty); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss); + Near(1); } - } ///////////////////////////////////////////////////////////////////////////////////////////////// // Scharr - -PARAM_TEST_CASE(Scharr, MatType, int, int, int) +struct Scharr : FilterTestBase { int type; int dx, dy, bordertype; - //src mat - cv::Mat mat1; - cv::Mat dst; - - // set up roi - int roicols; - int roirows; - int src1x; - int src1y; - int dstx; - int dsty; - - //src mat with roi - cv::Mat mat1_roi; - cv::Mat dst_roi; - //std::vector oclinfo; - //ocl dst mat for testing - cv::ocl::oclMat gdst_whole; - - //ocl mat with roi - cv::ocl::oclMat gmat1; - cv::ocl::oclMat gdst; - virtual void SetUp() { type = GET_PARAM(0); - dx = GET_PARAM(1); - dy = GET_PARAM(2); + Size s = GET_PARAM(2); + dx = s.width; + dy = s.height; bordertype = GET_PARAM(3); - dx = 1; - dy = 0; - - cv::RNG &rng = TS::ptr()->get_rng(); - cv::Size size(MWIDTH, MHEIGHT); - - mat1 = randomMat(rng, size, type, 5, 16, false); - dst = randomMat(rng, size, type, 5, 16, false); - - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); + Init(type); } - - void random_roi() - { -#ifdef RANDOMROI - //randomize ROI - cv::RNG &rng = TS::ptr()->get_rng(); - roicols = rng.uniform(2, mat1.cols); - roirows = rng.uniform(2, mat1.rows); - src1x = rng.uniform(0, mat1.cols - roicols); - src1y = rng.uniform(0, mat1.rows - roirows); - dstx = rng.uniform(0, dst.cols - roicols); - dsty = rng.uniform(0, dst.rows - roirows); -#else - roicols = mat1.cols; - roirows = mat1.rows; - src1x = 0; - src1y = 0; - dstx = 0; - dsty = 0; -#endif - - mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows)); - dst_roi = dst(Rect(dstx, dsty, roicols, roirows)); - - gdst_whole = dst; - gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows)); - - - gmat1 = mat1_roi; - } - }; TEST_P(Scharr, Mat) @@ -688,16 +281,9 @@ TEST_P(Scharr, Mat) for(int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::Scharr(mat1_roi, dst_roi, -1, dx, dy, /*scale*/1,/*delta*/0, bordertype); cv::ocl::Scharr(gmat1, gdst, -1, dx, dy,/*scale*/1,/*delta*/0, bordertype); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss); + Near(1); } } @@ -705,89 +291,23 @@ TEST_P(Scharr, Mat) ///////////////////////////////////////////////////////////////////////////////////////////////// // GaussianBlur - -PARAM_TEST_CASE(GaussianBlur, MatType, cv::Size, int) +struct GaussianBlur : FilterTestBase { int type; cv::Size ksize; int bordertype; - double sigma1, sigma2; - //src mat - cv::Mat mat1; - cv::Mat dst; - - // set up roi - int roicols; - int roirows; - int src1x; - int src1y; - int dstx; - int dsty; - - //src mat with roi - cv::Mat mat1_roi; - cv::Mat dst_roi; - //std::vector oclinfo; - //ocl dst mat for testing - cv::ocl::oclMat gdst_whole; - - //ocl mat with roi - cv::ocl::oclMat gmat1; - cv::ocl::oclMat gdst; - virtual void SetUp() { type = GET_PARAM(0); ksize = GET_PARAM(1); - bordertype = GET_PARAM(2); - + bordertype = GET_PARAM(3); + Init(type); cv::RNG &rng = TS::ptr()->get_rng(); - cv::Size size(MWIDTH, MHEIGHT); - sigma1 = rng.uniform(0.1, 1.0); sigma2 = rng.uniform(0.1, 1.0); - - mat1 = randomMat(rng, size, type, 5, 16, false); - dst = randomMat(rng, size, type, 5, 16, false); - - //int devnums = getDevice(oclinfo, 
OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); } - - void random_roi() - { -#ifdef RANDOMROI - //randomize ROI - cv::RNG &rng = TS::ptr()->get_rng(); - roicols = rng.uniform(2, mat1.cols); - roirows = rng.uniform(2, mat1.rows); - src1x = rng.uniform(0, mat1.cols - roicols); - src1y = rng.uniform(0, mat1.rows - roirows); - dstx = rng.uniform(0, dst.cols - roicols); - dsty = rng.uniform(0, dst.rows - roirows); -#else - roicols = mat1.cols; - roirows = mat1.rows; - src1x = 0; - src1y = 0; - dstx = 0; - dsty = 0; -#endif - - mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows)); - dst_roi = dst(Rect(dstx, dsty, roicols, roirows)); - - gdst_whole = dst; - gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows)); - - - gmat1 = mat1_roi; - } - }; TEST_P(GaussianBlur, Mat) @@ -795,53 +315,53 @@ TEST_P(GaussianBlur, Mat) for(int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::GaussianBlur(mat1_roi, dst_roi, ksize, sigma1, sigma2, bordertype); cv::ocl::GaussianBlur(gmat1, gdst, ksize, sigma1, sigma2, bordertype); - - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1.0, sss); + Near(1); } } -INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4), +INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine( + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4), Values(cv::Size(3, 3), cv::Size(5, 5), cv::Size(7, 7)), + Values(Size(0, 0)), //not use Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT, (MatType)cv::BORDER_REFLECT_101))); INSTANTIATE_TEST_CASE_P(Filters, Laplacian, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4), - Values(1, 3))); + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4), + Values(Size(3, 3)), + Values(Size(0, 0)), //not use + Values(0))); //not use -INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1))); - -//INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(Values(CV_8UC1, CV_8UC1), Values(false))); - -INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1))); - -//INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(Values(CV_8UC1, CV_8UC1), Values(false))); +INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine( + Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), + Values(Size(0, 0)), //not use + Values(Size(0, 0)), //not use + Values(1))); -INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine(Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4), - Values(1, 2), Values(0, 1), Values(3, 5), Values((MatType)cv::BORDER_CONSTANT, - (MatType)cv::BORDER_REPLICATE))); +INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine( + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4), + Values(Size(3, 3), Size(5, 5)), + Values(Size(1, 0), Size(1, 1), Size(2, 0), Size(2, 1)), + Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE))); INSTANTIATE_TEST_CASE_P(Filter, Scharr, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4), Values(0, 1), Values(0, 1), - Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE))); + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4), + Values(Size(0, 0)), //not use + Values(Size(0, 1), Size(1, 0)), + Values((MatType)cv::BORDER_CONSTANT, 
(MatType)cv::BORDER_REPLICATE))); INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur, Combine( - Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4), - Values(cv::Size(3, 3), cv::Size(5, 5)), - Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE))); + Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4), + Values(Size(3, 3), Size(5, 5)), + Values(Size(0, 0)), //not use + Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE))); diff --git a/modules/ocl/test/test_gemm.cpp b/modules/ocl/test/test_gemm.cpp index c26a8481f..a5d90ff01 100644 --- a/modules/ocl/test/test_gemm.cpp +++ b/modules/ocl/test/test_gemm.cpp @@ -53,13 +53,12 @@ PARAM_TEST_CASE(Gemm, int, cv::Size, int) int type; cv::Size mat_size; int flags; - //vector info; + virtual void SetUp() { type = GET_PARAM(0); mat_size = GET_PARAM(1); flags = GET_PARAM(2); - //cv::ocl::getDevice(info); } }; diff --git a/modules/ocl/test/test_haar.cpp b/modules/ocl/test/test_haar.cpp index 905160c80..652109d75 100644 --- a/modules/ocl/test/test_haar.cpp +++ b/modules/ocl/test/test_haar.cpp @@ -12,10 +12,12 @@ // // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // @Authors // Jia Haipeng, jiahaipeng95@gmail.com +// Sen Liu, swjutls1987@126.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -61,40 +63,31 @@ struct getRect } }; -PARAM_TEST_CASE(HaarTestBase, int, int) +PARAM_TEST_CASE(Haar, double, int) { - //std::vector oclinfo; cv::ocl::OclCascadeClassifier cascade, nestedCascade; + cv::ocl::OclCascadeClassifierBuf cascadebuf; cv::CascadeClassifier cpucascade, cpunestedCascade; - // Mat img; double scale; - int index; + int flags; virtual void SetUp() { - scale = 1.0; - index = 0; + scale = GET_PARAM(0); + flags = GET_PARAM(1); string cascadeName = workdir + "../../data/haarcascades/haarcascade_frontalface_alt.xml"; - if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName))) + if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) || (!cascadebuf.load( cascadeName ))) { cout << "ERROR: Could not load classifier cascade" << endl; return; } - //int devnums = getDevice(oclinfo); - //CV_Assert(devnums>0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); - //cv::ocl::setBinpath("E:\\"); } }; ////////////////////////////////faceDetect///////////////////////////////////////////////// - -struct Haar : HaarTestBase {}; - -TEST_F(Haar, FaceDetect) +TEST_P(Haar, FaceDetect) { string imgName = workdir + "lena.jpg"; Mat img = imread( imgName, 1 ); @@ -105,59 +98,65 @@ TEST_F(Haar, FaceDetect) return ; } - //int i = 0; - //double t = 0; vector faces, oclfaces; - // const static Scalar colors[] = { CV_RGB(0, 0, 255), - // CV_RGB(0, 128, 255), - // CV_RGB(0, 255, 255), - // CV_RGB(0, 255, 0), - // CV_RGB(255, 128, 0), - // CV_RGB(255, 255, 0), - // CV_RGB(255, 0, 0), - // CV_RGB(255, 0, 255) - // } ; - Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 ); MemStorage storage(cvCreateMemStorage(0)); cvtColor( img, gray, COLOR_BGR2GRAY ); resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR ); equalizeHist( smallImg, smallImg ); - cv::ocl::oclMat image; CvSeq *_objects; 
image.upload(smallImg); _objects = cascade.oclHaarDetectObjects( image, storage, 1.1, - 3, 0 - | CV_HAAR_SCALE_IMAGE - , Size(30, 30), Size(0, 0) ); + 3, flags, Size(30, 30), Size(0, 0) ); vector vecAvgComp; Seq(_objects).copyTo(vecAvgComp); oclfaces.resize(vecAvgComp.size()); std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect()); - cpucascade.detectMultiScale( smallImg, faces, 1.1, - 3, 0 - | CV_HAAR_SCALE_IMAGE - , Size(30, 30), Size(0, 0) ); + cpucascade.detectMultiScale( smallImg, faces, 1.1, 3, + flags, + Size(30, 30), Size(0, 0) ); EXPECT_EQ(faces.size(), oclfaces.size()); - /* for( vector::const_iterator r = faces.begin(); r != faces.end(); r++, i++ ) - { - Mat smallImgROI; - Point center; - Scalar color = colors[i%8]; - int radius; - center.x = cvRound((r->x + r->width*0.5)*scale); - center.y = cvRound((r->y + r->height*0.5)*scale); - radius = cvRound((r->width + r->height)*0.25*scale); - circle( img, center, radius, color, 3, 8, 0 ); - } */ - //namedWindow("result"); - //imshow("result",img); - //waitKey(0); - //destroyAllWindows(); - } + +TEST_P(Haar, FaceDetectUseBuf) +{ + string imgName = workdir + "lena.jpg"; + Mat img = imread( imgName, 1 ); + + if(img.empty()) + { + std::cout << "Couldn't read " << imgName << std::endl; + return ; + } + + vector faces, oclfaces; + + Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 ); + MemStorage storage(cvCreateMemStorage(0)); + cvtColor( img, gray, CV_BGR2GRAY ); + resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR ); + equalizeHist( smallImg, smallImg ); + + cv::ocl::oclMat image; + image.upload(smallImg); + + cascadebuf.detectMultiScale( image, oclfaces, 1.1, 3, + flags, + Size(30, 30), Size(0, 0) ); + cascadebuf.release(); + + cpucascade.detectMultiScale( smallImg, faces, 1.1, 3, + flags, + Size(30, 30), Size(0, 0) ); + EXPECT_EQ(faces.size(), oclfaces.size()); +} + +INSTANTIATE_TEST_CASE_P(FaceDetect, Haar, + Combine(Values(1.0), + Values(CV_HAAR_SCALE_IMAGE, 0))); + #endif // HAVE_OPENCL diff --git a/modules/ocl/test/test_hog.cpp b/modules/ocl/test/test_hog.cpp index f064ee3a7..e968d0444 100644 --- a/modules/ocl/test/test_hog.cpp +++ b/modules/ocl/test/test_hog.cpp @@ -240,12 +240,11 @@ TEST_P(HOG, Detect) } } - char s[100] = {0}; - EXPECT_MAT_NEAR(cv::Mat(d_comp), cv::Mat(comp), 3, s); + EXPECT_MAT_NEAR(cv::Mat(d_comp), cv::Mat(comp), 3); } -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HOG, testing::Combine( +INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine( testing::Values(cv::Size(64, 128), cv::Size(48, 96)), testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)))); diff --git a/modules/ocl/test/test_imgproc.cpp b/modules/ocl/test/test_imgproc.cpp index 5ec5c152d..7c8b5c829 100644 --- a/modules/ocl/test/test_imgproc.cpp +++ b/modules/ocl/test/test_imgproc.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // @Authors @@ -327,7 +328,7 @@ PARAM_TEST_CASE(ImgprocTestBase, MatType, MatType, MatType, MatType, MatType, bo cv::Mat mask_roi; cv::Mat dst_roi; cv::Mat dst1_roi; //bak - //std::vector oclinfo; + //ocl mat cv::ocl::oclMat clmat1; cv::ocl::oclMat clmat2; @@ -352,10 +353,6 @@ PARAM_TEST_CASE(ImgprocTestBase, MatType, MatType, MatType, MatType, MatType, bo cv::RNG &rng = TS::ptr()->get_rng(); cv::Size size(MWIDTH, MHEIGHT); double min = 1, max = 20; - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); if(type1 != nulltype) { @@ -445,6 +442,13 @@ PARAM_TEST_CASE(ImgprocTestBase, MatType, MatType, MatType, MatType, MatType, bo clmask_roi = clmask(Rect(maskx, masky, roicols, roirows)); } } + + void Near(double threshold) + { + cv::Mat cpu_cldst; + cldst.download(cpu_cldst); + EXPECT_MAT_NEAR(dst, cpu_cldst, threshold); + } }; ////////////////////////////////equalizeHist////////////////////////////////////////// @@ -464,11 +468,7 @@ TEST_P(equalizeHist, Mat) random_roi(); cv::equalizeHist(mat1_roi, dst_roi); cv::ocl::equalizeHist(clmat1_roi, cldst_roi); - cv::Mat cpu_cldst; - cldst.download(cpu_cldst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,dst1x=%d,dst1y=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, dst1x, dst1y, maskx, masky, src2x, src2y); - EXPECT_MAT_NEAR(dst, cpu_cldst, 1.1, sss); + Near(1.1); } } } @@ -488,7 +488,7 @@ TEST_P(bilateralFilter, Mat) int d = 2 * radius + 1; double sigmaspace = 20.0; int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE, cv::BORDER_REFLECT, cv::BORDER_WRAP, cv::BORDER_REFLECT_101}; - const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101"}; + //const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101"}; if (mat1.depth() != CV_8U || mat1.type() != dst.type()) { @@ -517,25 +517,7 @@ TEST_P(bilateralFilter, Mat) cv::bilateralFilter(mat1_roi, dst_roi, d, sigmacolor, sigmaspace, bordertype[i] | cv::BORDER_ISOLATED); cv::ocl::bilateralFilter(clmat1_roi, cldst_roi, d, sigmacolor, sigmaspace, bordertype[i] | cv::BORDER_ISOLATED); - - cv::Mat cpu_cldst; - cldst.download(cpu_cldst); - - - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,radius=%d,boredertype=%s", roicols, roirows, src1x, src1y, dstx, dsty, radius, borderstr[i]); - //for(int i=0;i(i,j)!=cpu_cldst.at(i,j)) - // cout<< i <<" "<< j <<" "<< (int)dst.at(i,j)<<" "<< (int)cpu_cldst.at(i,j)<<" "; - // } - // cout<get_rng(); int top = rng.uniform(0, 10); int bottom = rng.uniform(0, 10); @@ -587,24 +569,12 @@ TEST_P(CopyMakeBorder, Mat) cv::Mat cpu_cldst; #ifndef RANDOMROI cldst_roi.download(cpu_cldst); + EXPECT_MAT_NEAR(dst_roi, cpu_cldst, 0.0); #else cldst.download(cpu_cldst); + EXPECT_MAT_NEAR(dst, cpu_cldst, 0.0); #endif - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,dst1x=%d,dst1y=%d,top=%d,bottom=%d,left=%d,right=%d, bordertype=%s", roicols, roirows, src1x, src1y, dstx, dsty, dst1x, dst1y, top, bottom, left, right, borderstr[i]); -#ifndef RANDOMROI - EXPECT_MAT_NEAR(dst_roi, cpu_cldst, 0.0, sss); -#else - //for(int i=0;i(i,j)<<" "; - //} - //cout< oclinfo; + //ocl dst mat for testing cv::ocl::oclMat gdst_whole; @@ -740,10 +693,6 @@ PARAM_TEST_CASE(WarpTestBase, MatType, int) 
mat1 = randomMat(rng, size, type, 5, 16, false); dst = randomMat(rng, size, type, 5, 16, false); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); } void random_roi() @@ -805,10 +754,7 @@ TEST_P(WarpAffine, Mat) cv::Mat cpu_dst; gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "src_roicols=%d,src_roirows=%d,dst_roicols=%d,dst_roirows=%d,src1x =%d,src1y=%d,dstx=%d,dsty=%d", src_roicols, src_roirows, dst_roicols, dst_roirows, src1x, src1y, dstx, dsty); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1.0, sss); + EXPECT_MAT_NEAR(dst, cpu_dst, 1.0); } } @@ -837,10 +783,7 @@ TEST_P(WarpPerspective, Mat) cv::Mat cpu_dst; gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "src_roicols=%d,src_roirows=%d,dst_roicols=%d,dst_roirows=%d,src1x =%d,src1y=%d,dstx=%d,dsty=%d", src_roicols, src_roirows, dst_roicols, dst_roirows, src1x, src1y, dstx, dsty); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1.0, sss); + EXPECT_MAT_NEAR(dst, cpu_dst, 1.0); } } @@ -905,9 +848,6 @@ PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int) interpolation = GET_PARAM(3); bordertype = GET_PARAM(4); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - cv::RNG &rng = TS::ptr()->get_rng(); cv::Size srcSize = cv::Size(MWIDTH, MHEIGHT); cv::Size map1Size = cv::Size(MWIDTH, MHEIGHT); @@ -1004,7 +944,7 @@ TEST_P(Remap, Mat) return; } int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/}; - const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/}; + //const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/}; // for(int i = 0; i < sizeof(bordertype)/sizeof(int); i++) for(int j = 0; j < LOOP_TIMES; j++) { @@ -1014,13 +954,9 @@ TEST_P(Remap, Mat) cv::Mat cpu_dst; gdst.download(cpu_dst); - char sss[1024]; - sprintf(sss, "src_roicols=%d,src_roirows=%d,dst_roicols=%d,dst_roirows=%d,src1x =%d,src1y=%d,dstx=%d,dsty=%d bordertype=%s", src_roicols, src_roirows, dst_roicols, dst_roirows, srcx, srcy, dstx, dsty, borderstr[0]); - - if(interpolation == 0) - EXPECT_MAT_NEAR(dst, cpu_dst, 1.0, sss); - EXPECT_MAT_NEAR(dst, cpu_dst, 2.0, sss); + EXPECT_MAT_NEAR(dst, cpu_dst, 1.0); + EXPECT_MAT_NEAR(dst, cpu_dst, 2.0); } } @@ -1051,7 +987,6 @@ PARAM_TEST_CASE(Resize, MatType, cv::Size, double, double, int) int dstx; int dsty; - //std::vector oclinfo; //src mat with roi cv::Mat mat1_roi; cv::Mat dst_roi; @@ -1090,10 +1025,6 @@ PARAM_TEST_CASE(Resize, MatType, cv::Size, double, double, int) mat1 = randomMat(rng, size, type, 5, 16, false); dst = randomMat(rng, dsize, type, 5, 16, false); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); } void random_roi() @@ -1149,10 +1080,7 @@ TEST_P(Resize, Mat) cv::Mat cpu_dst; gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "src_roicols=%d,src_roirows=%d,dst_roicols=%d,dst_roirows=%d,src1x =%d,src1y=%d,dstx=%d,dsty=%d", src_roicols, src_roirows, dst_roicols, dst_roirows, src1x, src1y, dstx, dsty); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1.0, sss); + EXPECT_MAT_NEAR(dst, cpu_dst, 1.0); } } @@ -1181,7 +1109,7 @@ PARAM_TEST_CASE(Threshold, MatType, ThreshOp) //src mat with roi cv::Mat mat1_roi; cv::Mat dst_roi; - 
//std::vector oclinfo; + //ocl dst mat for testing cv::ocl::oclMat gdst_whole; @@ -1199,11 +1127,6 @@ PARAM_TEST_CASE(Threshold, MatType, ThreshOp) mat1 = randomMat(rng, size, type, 5, 16, false); dst = randomMat(rng, size, type, 5, 16, false); - - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); } void random_roi() @@ -1251,12 +1174,7 @@ TEST_P(Threshold, Mat) cv::Mat cpu_dst; gdst_whole.download(cpu_dst); - - //EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5) - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x =%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x , src1y, dstx, dsty); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss); + EXPECT_MAT_NEAR(dst, cpu_dst, 1); } } @@ -1288,7 +1206,6 @@ PARAM_TEST_CASE(meanShiftTestBase, MatType, MatType, int, int, cv::TermCriteria) cv::ocl::oclMat gdst; cv::ocl::oclMat gdstCoor; - //std::vector oclinfo; //ocl mat with roi cv::ocl::oclMat gsrc_roi; cv::ocl::oclMat gdst_roi; @@ -1311,10 +1228,6 @@ PARAM_TEST_CASE(meanShiftTestBase, MatType, MatType, int, int, cv::TermCriteria) dst = randomMat(rng, size, type, 5, 16, false); dstCoor = randomMat(rng, size, typeCoor, 5, 16, false); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); } void random_roi() @@ -1367,11 +1280,7 @@ TEST_P(meanShiftFiltering, Mat) cv::ocl::meanShiftFiltering(gsrc_roi, gdst_roi, sp, sr, crit); gdst.download(cpu_gdst); - - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,srcx=%d,srcy=%d,dstx=%d,dsty=%d\n", roicols, roirows, srcx, srcy, dstx, dsty); - EXPECT_MAT_NEAR(dst, cpu_gdst, 0.0, sss); - + EXPECT_MAT_NEAR(dst, cpu_gdst, 0.0); } } @@ -1393,11 +1302,8 @@ TEST_P(meanShiftProc, Mat) gdst.download(cpu_gdst); gdstCoor.download(cpu_gdstCoor); - - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,srcx=%d,srcy=%d,dstx=%d,dsty=%d\n", roicols, roirows, srcx, srcy, dstx, dsty); - EXPECT_MAT_NEAR(dst, cpu_gdst, 0.0, sss); - EXPECT_MAT_NEAR(dstCoor, cpu_gdstCoor, 0.0, sss); + EXPECT_MAT_NEAR(dst, cpu_gdst, 0.0); + EXPECT_MAT_NEAR(dstCoor, cpu_gdstCoor, 0.0); } } @@ -1436,7 +1342,6 @@ PARAM_TEST_CASE(histTestBase, MatType, MatType) cv::ocl::oclMat gdst_hist; //ocl mat with roi cv::ocl::oclMat gsrc_roi; - // std::vector oclinfo; virtual void SetUp() { @@ -1447,10 +1352,6 @@ PARAM_TEST_CASE(histTestBase, MatType, MatType) src = randomMat(rng, size, type_src, 0, 256, false); - // int devnums = getDevice(oclinfo); - // CV_Assert(devnums > 0); - //if you want to use undefault device, set it here - //setDevice(oclinfo[0]); } void random_roi() @@ -1489,10 +1390,7 @@ TEST_P(calcHist, Mat) cv::ocl::calcHist(gsrc_roi, gdst_hist); gdst_hist.download(cpu_hist); - - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,srcx=%d,srcy=%d\n", roicols, roirows, srcx, srcy); - EXPECT_MAT_NEAR(dst_hist, cpu_hist, 0.0, sss); + EXPECT_MAT_NEAR(dst_hist, cpu_hist, 0.0); } } @@ -1629,11 +1527,7 @@ TEST_P(Convolve, Mat) cv::Mat cpu_dst; gdst_whole.download(cpu_dst); - - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_dst, 1e-1, sss); + EXPECT_MAT_NEAR(dst, cpu_dst, .1); } } diff --git a/modules/ocl/test/test_match_template.cpp b/modules/ocl/test/test_match_template.cpp index 5da7f01cd..a393abdeb 100644 --- 
a/modules/ocl/test/test_match_template.cpp +++ b/modules/ocl/test/test_match_template.cpp @@ -62,7 +62,6 @@ PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMetho cv::Size templ_size; int cn; int method; - //std::vector oclinfo; virtual void SetUp() { @@ -70,8 +69,6 @@ PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMetho templ_size = GET_PARAM(1); cn = GET_PARAM(2); method = GET_PARAM(3); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); } }; @@ -92,12 +89,10 @@ TEST_P(MatchTemplate8U, Accuracy) cv::Mat dst_gold; cv::matchTemplate(image, templ, dst_gold, method); - char sss [100] = ""; - cv::Mat mat_dst; dst.download(mat_dst); - EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss); + EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1); } PARAM_TEST_CASE(MatchTemplate32F, cv::Size, TemplateSize, Channels, TemplateMethod) @@ -114,8 +109,6 @@ PARAM_TEST_CASE(MatchTemplate32F, cv::Size, TemplateSize, Channels, TemplateMeth templ_size = GET_PARAM(1); cn = GET_PARAM(2); method = GET_PARAM(3); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); } }; @@ -130,12 +123,10 @@ TEST_P(MatchTemplate32F, Accuracy) cv::Mat dst_gold; cv::matchTemplate(image, templ, dst_gold, method); - char sss [100] = ""; - cv::Mat mat_dst; dst.download(mat_dst); - EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss); + EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1); } INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MatchTemplate8U, diff --git a/modules/ocl/test/test_matrix_operation.cpp b/modules/ocl/test/test_matrix_operation.cpp index ef11aaa13..92d810818 100644 --- a/modules/ocl/test/test_matrix_operation.cpp +++ b/modules/ocl/test/test_matrix_operation.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // @Authors @@ -72,7 +73,7 @@ PARAM_TEST_CASE(ConvertToTestBase, MatType, MatType) //src mat with roi cv::Mat mat_roi; cv::Mat dst_roi; - //std::vector oclinfo; + //ocl dst mat for testing cv::ocl::oclMat gdst_whole; @@ -90,11 +91,6 @@ PARAM_TEST_CASE(ConvertToTestBase, MatType, MatType) mat = randomMat(rng, size, type, 5, 16, false); dst = randomMat(rng, size, type, 5, 16, false); - //std::vector oclinfo; - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); } void random_roi() @@ -139,12 +135,7 @@ TEST_P(ConvertTo, Accuracy) mat_roi.convertTo(dst_roi, dst_type); gmat.convertTo(gdst, dst_type); - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,srcx =%d,srcy=%d,dstx=%d,dsty=%d", roicols, roirows, srcx , srcy, dstx, dsty); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss); + EXPECT_MAT_NEAR(dst, Mat(gdst_whole), 0.0); } } @@ -175,7 +166,7 @@ PARAM_TEST_CASE(CopyToTestBase, MatType, bool) cv::Mat mat_roi; cv::Mat mask_roi; cv::Mat dst_roi; - //std::vector oclinfo; + //ocl dst mat for testing cv::ocl::oclMat gdst_whole; @@ -197,10 +188,6 @@ PARAM_TEST_CASE(CopyToTestBase, MatType, bool) cv::threshold(mask, mask, 0.5, 255., CV_8UC1); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); } void random_roi() @@ -250,12 +237,7 @@ TEST_P(CopyTo, Without_mask) mat_roi.copyTo(dst_roi); gmat.copyTo(gdst); - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,srcx =%d,srcy=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d", roicols, roirows, srcx , srcy, dstx, dsty, maskx, masky); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss); + EXPECT_MAT_NEAR(dst, Mat(gdst_whole), 0.0); } } @@ -268,12 +250,7 @@ TEST_P(CopyTo, With_mask) mat_roi.copyTo(dst_roi, mask_roi); gmat.copyTo(gdst, gmask); - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,srcx =%d,srcy=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d", roicols, roirows, srcx , srcy, dstx, dsty, maskx, masky); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss); + EXPECT_MAT_NEAR(dst, Mat(gdst_whole), 0.0); } } @@ -301,7 +278,7 @@ PARAM_TEST_CASE(SetToTestBase, MatType, bool) //src mat with roi cv::Mat mat_roi; cv::Mat mask_roi; - //std::vector oclinfo; + //ocl dst mat for testing cv::ocl::oclMat gmat_whole; @@ -322,10 +299,6 @@ PARAM_TEST_CASE(SetToTestBase, MatType, bool) cv::threshold(mask, mask, 0.5, 255., CV_8UC1); val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0)); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); } void random_roi() @@ -369,12 +342,7 @@ TEST_P(SetTo, Without_mask) mat_roi.setTo(val); gmat.setTo(val); - cv::Mat cpu_dst; - gmat_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,srcx =%d,srcy=%d,maskx=%d,masky=%d", roicols, roirows, srcx , srcy, maskx, masky); - - EXPECT_MAT_NEAR(mat, cpu_dst, 1., sss); + EXPECT_MAT_NEAR(mat, Mat(gmat_whole), 1.); } } @@ -387,12 +355,7 @@ TEST_P(SetTo, With_mask) mat_roi.setTo(val, mask_roi); gmat.setTo(val, gmask); - cv::Mat cpu_dst; - gmat_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, 
"roicols=%d,roirows=%d,srcx =%d,srcy=%d,maskx=%d,masky=%d", roicols, roirows, srcx , srcy, maskx, masky); - - EXPECT_MAT_NEAR(mat, cpu_dst, 1., sss); + EXPECT_MAT_NEAR(mat, Mat(gmat_whole), 1.); } } @@ -417,7 +380,7 @@ PARAM_TEST_CASE(convertC3C4, MatType, cv::Size) //src mat with roi cv::Mat mat1_roi; cv::Mat dst_roi; - //std::vector oclinfo; + //ocl dst mat for testing cv::ocl::oclMat gdst_whole; @@ -430,13 +393,6 @@ PARAM_TEST_CASE(convertC3C4, MatType, cv::Size) type = GET_PARAM(0); ksize = GET_PARAM(1); - - - //dst = randomMat(rng, size, type, 5, 16, false); - //int devnums = getDevice(oclinfo); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[1]); } void random_roi() @@ -483,11 +439,8 @@ TEST_P(convertC3C4, Accuracy) mat1 = randomMat(rng, size, type, 0, 40, false); gmat1 = mat1; - cv::Mat cpu_dst; - gmat1.download(cpu_dst); - char sss[1024]; - sprintf(sss, "cols=%d,rows=%d", mat1.cols, mat1.rows); - EXPECT_MAT_NEAR(mat1, cpu_dst, 0.0, sss); + + EXPECT_MAT_NEAR(mat1, Mat(gmat1), 0.0); } } diff --git a/modules/ocl/test/test_moments.cpp b/modules/ocl/test/test_moments.cpp index 47da0fd7d..23a9a8eb2 100644 --- a/modules/ocl/test/test_moments.cpp +++ b/modules/ocl/test/test_moments.cpp @@ -9,7 +9,7 @@ using namespace cvtest; using namespace testing; using namespace std; extern string workdir; -PARAM_TEST_CASE(MomentsTestBase, MatType, bool) +PARAM_TEST_CASE(MomentsTest, MatType, bool) { int type; cv::Mat mat1; @@ -29,13 +29,13 @@ PARAM_TEST_CASE(MomentsTestBase, MatType, bool) Mat gpu_dst, cpu_dst; HuMoments(cpu, cpu_dst); HuMoments(gpu, gpu_dst); - EXPECT_MAT_NEAR(gpu_dst,cpu_dst, .5, ""); + EXPECT_MAT_NEAR(gpu_dst,cpu_dst, .5); } }; -struct ocl_Moments : MomentsTestBase {}; -TEST_P(ocl_Moments, Mat) + +TEST_P(MomentsTest, Mat) { bool binaryImage = 0; SetUp(); @@ -66,6 +66,6 @@ TEST_P(ocl_Moments, Mat) } } -INSTANTIATE_TEST_CASE_P(Moments, ocl_Moments, Combine( +INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MomentsTest, Combine( Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_64FC1), Values(true,false))); #endif // HAVE_OPENCL diff --git a/modules/ocl/test/test_pyrlk.cpp b/modules/ocl/test/test_optflow.cpp similarity index 72% rename from modules/ocl/test/test_pyrlk.cpp rename to modules/ocl/test/test_optflow.cpp index 7c747ee4f..b08d33a08 100644 --- a/modules/ocl/test/test_pyrlk.cpp +++ b/modules/ocl/test/test_optflow.cpp @@ -1,4 +1,4 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////// // // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // @@ -7,12 +7,16 @@ // copy or use the software. // // -// Intel License Agreement +// License Agreement // For Open Source Computer Vision Library -// -// Copyright (C) 2000, Intel Corporation, all rights reserved. +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// +// @Authors +// +// // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // @@ -21,9 +25,9 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. +// and/or other oclMaterials provided with the distribution. // -// * The name of Intel Corporation may not be used to endorse or promote products +// * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. // // This software is provided by the copyright holders and contributors "as is" and @@ -50,20 +54,49 @@ using namespace cvtest; using namespace testing; using namespace std; -//#define DUMP - -///////////////////////////////////////////////////////////////////////////////////////////////// -// BroxOpticalFlow extern string workdir; -#define BROX_OPTICAL_FLOW_DUMP_FILE "opticalflow/brox_optical_flow.bin" -#define BROX_OPTICAL_FLOW_DUMP_FILE_CC20 "opticalflow/brox_optical_flow_cc20.bin" +////////////////////////////////////////////////////////////////////////// +PARAM_TEST_CASE(TVL1, bool) +{ + bool useRoi; + + virtual void SetUp() + { + useRoi = GET_PARAM(0); + } + +}; + +TEST_P(TVL1, Accuracy) +{ + cv::Mat frame0 = readImage(workdir + "../gpu/rubberwhale1.png", cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame0.empty()); + + cv::Mat frame1 = readImage(workdir + "../gpu/rubberwhale2.png", cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame1.empty()); + + cv::ocl::OpticalFlowDual_TVL1_OCL d_alg; + cv::RNG &rng = TS::ptr()->get_rng(); + cv::Mat flowx = randomMat(rng, frame0.size(), CV_32FC1, 0, 0, useRoi); + cv::Mat flowy = randomMat(rng, frame0.size(), CV_32FC1, 0, 0, useRoi); + cv::ocl::oclMat d_flowx(flowx), d_flowy(flowy); + d_alg(oclMat(frame0), oclMat(frame1), d_flowx, d_flowy); + + cv::Ptr alg = cv::createOptFlow_DualTVL1(); + cv::Mat flow; + alg->calc(frame0, frame1, flow); + cv::Mat gold[2]; + cv::split(flow, gold); + + EXPECT_MAT_SIMILAR(gold[0], d_flowx, 3e-3); + EXPECT_MAT_SIMILAR(gold[1], d_flowy, 3e-3); +} +INSTANTIATE_TEST_CASE_P(OCL_Video, TVL1, Values(true, false)); ///////////////////////////////////////////////////////////////////////////////////////////////// // PyrLKOpticalFlow -//IMPLEMENT_PARAM_CLASS(UseGray, bool) - PARAM_TEST_CASE(Sparse, bool, bool) { bool useGray; @@ -72,7 +105,7 @@ PARAM_TEST_CASE(Sparse, bool, bool) virtual void SetUp() { UseSmart = GET_PARAM(0); - useGray = GET_PARAM(0); + useGray = GET_PARAM(1); } }; @@ -159,9 +192,9 @@ TEST_P(Sparse, Mat) } -INSTANTIATE_TEST_CASE_P(Video, Sparse, Combine( - Values(false, true), - Values(false))); +INSTANTIATE_TEST_CASE_P(OCL_Video, Sparse, Combine( + Values(false, true), + Values(false, true))); #endif // HAVE_OPENCL diff --git a/modules/ocl/test/test_pyrdown.cpp b/modules/ocl/test/test_pyrdown.cpp index c7233cc78..6d00fb5e4 100644 --- a/modules/ocl/test/test_pyrdown.cpp +++ b/modules/ocl/test/test_pyrdown.cpp @@ -65,15 +65,6 @@ PARAM_TEST_CASE(PyrDown, MatType, int) { type = GET_PARAM(0); channels = GET_PARAM(1); - - //int devnums = getDevice(oclinfo); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); - } - - void Cleanup() - { } }; @@ -92,17 +83,11 @@ TEST_P(PyrDown, Mat) cv::pyrDown(src, dst_cpu); cv::ocl::pyrDown(gsrc, gdst); - cv::Mat dst; - 
gdst.download(dst); - char s[1024] = {0}; - - EXPECT_MAT_NEAR(dst, dst_cpu, dst.depth() == CV_32F ? 1e-4f : 1.0f, s); - - Cleanup(); + EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), type == CV_32F ? 1e-4f : 1.0f); } } -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, Combine( +INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrDown, Combine( Values(CV_8U, CV_32F), Values(1, 3, 4))); diff --git a/modules/ocl/test/test_pyrup.cpp b/modules/ocl/test/test_pyrup.cpp index 2e0cb92cc..3c3c6ef47 100644 --- a/modules/ocl/test/test_pyrup.cpp +++ b/modules/ocl/test/test_pyrup.cpp @@ -57,12 +57,9 @@ PARAM_TEST_CASE(PyrUp, MatType, int) { int type; int channels; - //std::vector oclinfo; virtual void SetUp() { - //int devnums = cv::ocl::getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); type = GET_PARAM(0); channels = GET_PARAM(1); } @@ -79,17 +76,14 @@ TEST_P(PyrUp, Accuracy) ocl::oclMat dst; ocl::oclMat srcMat(src); ocl::pyrUp(srcMat, dst); - Mat cpu_dst; - dst.download(cpu_dst); - char s[100] = {0}; - EXPECT_MAT_NEAR(dst_gold, cpu_dst, (src.depth() == CV_32F ? 1e-4f : 1.0), s); + EXPECT_MAT_NEAR(dst_gold, Mat(dst), (type == CV_32F ? 1e-4f : 1.0)); } } -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine( +INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrUp, testing::Combine( Values(CV_8U, CV_32F), Values(1, 3, 4))); diff --git a/modules/ocl/test/test_split_merge.cpp b/modules/ocl/test/test_split_merge.cpp index f41d16eee..854ce309c 100644 --- a/modules/ocl/test/test_split_merge.cpp +++ b/modules/ocl/test/test_split_merge.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // @Authors @@ -87,7 +88,7 @@ PARAM_TEST_CASE(MergeTestBase, MatType, int) //dst mat with roi cv::Mat dst_roi; - //std::vector oclinfo; + //ocl dst mat for testing cv::ocl::oclMat gdst_whole; @@ -112,10 +113,6 @@ PARAM_TEST_CASE(MergeTestBase, MatType, int) mat4 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false); dst = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); } void random_roi() @@ -205,12 +202,7 @@ TEST_P(Merge, Accuracy) cv::merge(dev_src, dst_roi); cv::ocl::merge(dev_gsrc, gdst); - cv::Mat cpu_dst; - gdst_whole.download(cpu_dst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x =%d,src1y=%d,src2x =%d,src2y=%d,src3x =%d,src3y=%d,src4x =%d,src4y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, src2x , src2y, src3x , src3y, src4x , src4y, dstx, dsty); - - EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss); + EXPECT_MAT_NEAR(dst, Mat(gdst_whole), 0.0); } } @@ -252,7 +244,7 @@ PARAM_TEST_CASE(SplitTestBase, MatType, int) cv::Mat dst2_roi; cv::Mat dst3_roi; cv::Mat dst4_roi; - //std::vector oclinfo; + //ocl dst mat for testing cv::ocl::oclMat gdst1_whole; cv::ocl::oclMat gdst2_whole; @@ -280,10 +272,6 @@ PARAM_TEST_CASE(SplitTestBase, MatType, int) dst3 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false); dst4 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false); - //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE); - //CV_Assert(devnums > 0); - ////if you want to use undefault device, set it here - ////setDevice(oclinfo[0]); } void random_roi() @@ -356,28 +344,17 @@ TEST_P(Split, Accuracy) cv::split(mat_roi, dev_dst); cv::ocl::split(gmat, dev_gdst); - cv::Mat cpu_dst1; - cv::Mat cpu_dst2; - cv::Mat cpu_dst3; - cv::Mat cpu_dst4; - gdst1_whole.download(cpu_dst1); - gdst2_whole.download(cpu_dst2); - gdst3_whole.download(cpu_dst3); - gdst4_whole.download(cpu_dst4); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,dst1x =%d,dsty=%d,dst2x =%d,dst2y=%d,dst3x =%d,dst3y=%d,dst4x =%d,dst4y=%d,srcx=%d,srcy=%d", roicols, roirows, dst1x , dst1y, dst2x , dst2y, dst3x , dst3y, dst4x , dst4y, srcx, srcy); - if(channels >= 1) - EXPECT_MAT_NEAR(dst1, cpu_dst1, 0.0, sss); + EXPECT_MAT_NEAR(dst1, Mat(gdst1_whole), 0.0); if(channels >= 2) - EXPECT_MAT_NEAR(dst2, cpu_dst2, 0.0, sss); + EXPECT_MAT_NEAR(dst2, Mat(gdst2_whole), 0.0); if(channels >= 3) - EXPECT_MAT_NEAR(dst3, cpu_dst3, 0.0, sss); + EXPECT_MAT_NEAR(dst3, Mat(gdst3_whole), 0.0); if(channels >= 4) - EXPECT_MAT_NEAR(dst4, cpu_dst4, 0.0, sss); + EXPECT_MAT_NEAR(dst4, Mat(gdst4_whole), 0.0); } } diff --git a/modules/ocl/test/utility.hpp b/modules/ocl/test/utility.hpp index 947cb379d..9eb48a0ef 100644 --- a/modules/ocl/test/utility.hpp +++ b/modules/ocl/test/utility.hpp @@ -78,20 +78,20 @@ double checkSimilarity(const cv::Mat &m1, const cv::Mat &m2); EXPECT_LE(checkNorm(cv::Mat(mat)), eps) \ } -/*#define EXPECT_MAT_NEAR(mat1, mat2, eps) \ +#define EXPECT_MAT_NEAR(mat1, mat2, eps) \ { \ ASSERT_EQ(mat1.type(), mat2.type()); \ ASSERT_EQ(mat1.size(), mat2.size()); \ EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps); \ -}*/ - +} +/* #define EXPECT_MAT_NEAR(mat1, mat2, eps,s) \ { \ ASSERT_EQ(mat1.type(), mat2.type()); \ ASSERT_EQ(mat1.size(), mat2.size()); \ EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps)<init(resourcesubdir); \ ::testing::InitGoogleTest(&argc, argv); \ + 
cvtest::printVersionInfo();\ return RUN_ALL_TESTS(); \ } diff --git a/modules/ts/include/opencv2/ts/ts_perf.hpp b/modules/ts/include/opencv2/ts/ts_perf.hpp index 93e5d2865..cf549ee48 100644 --- a/modules/ts/include/opencv2/ts/ts_perf.hpp +++ b/modules/ts/include/opencv2/ts/ts_perf.hpp @@ -474,6 +474,7 @@ int main(int argc, char **argv)\ ::perf::Regression::Init(#testsuitname);\ ::perf::TestBase::Init(argc, argv);\ ::testing::InitGoogleTest(&argc, argv);\ + cvtest::printVersionInfo();\ return RUN_ALL_TESTS();\ } diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp index 4b48cd9ed..0f3751e52 100644 --- a/modules/ts/src/ts_func.cpp +++ b/modules/ts/src/ts_func.cpp @@ -2937,4 +2937,30 @@ MatComparator::operator()(const char* expr1, const char* expr2, << "'" << expr2 << "': " << MatPart(m2part, border > 0 ? &loc : 0) << ".\n"; } +void printVersionInfo(bool useStdOut) +{ + ::testing::Test::RecordProperty("CV_VERSION", CV_VERSION); + if(useStdOut) std::cout << "OpenCV version: " << CV_VERSION << std::endl; + + std::string buildInfo( cv::getBuildInformation() ); + + size_t pos1 = buildInfo.find("Version control"); + size_t pos2 = buildInfo.find("\n", pos1);\ + if(pos1 != std::string::npos && pos2 != std::string::npos) + { + std::string ver( buildInfo.substr(pos1, pos2-pos1) ); + ::testing::Test::RecordProperty("Version_control", ver); + if(useStdOut) std::cout << ver << std::endl; + } + + pos1 = buildInfo.find("inner version"); + pos2 = buildInfo.find("\n", pos1);\ + if(pos1 != std::string::npos && pos2 != std::string::npos) + { + std::string ver( buildInfo.substr(pos1, pos2-pos1) ); + ::testing::Test::RecordProperty("inner_version", ver); + if(useStdOut) std::cout << ver << std::endl; + } +} + } diff --git a/samples/android/CMakeLists.txt b/samples/android/CMakeLists.txt index 9d7b0cbf0..c0b626d9b 100644 --- a/samples/android/CMakeLists.txt +++ b/samples/android/CMakeLists.txt @@ -10,16 +10,13 @@ add_subdirectory(15-puzzle) add_subdirectory(face-detection) add_subdirectory(image-manipulations) add_subdirectory(color-blob-detection) - -if (ANDROID_NATIVE_API_LEVEL GREATER 8) - add_subdirectory(native-activity) -endif() - add_subdirectory(tutorial-1-camerapreview) add_subdirectory(tutorial-2-mixedprocessing) add_subdirectory(tutorial-3-cameracontrol) -#hello-android sample +add_subdirectory(native-activity) + +# hello-android sample if(HAVE_opencv_highgui) ocv_include_modules_recurse(opencv_highgui opencv_core) add_executable(hello-android hello-android/main.cpp) diff --git a/samples/android/native-activity/src/org/opencv/samples/NativeActivity/CvNativeActivity.java b/samples/android/native-activity/src/org/opencv/samples/NativeActivity/CvNativeActivity.java index d8e437533..1a9302b1d 100644 --- a/samples/android/native-activity/src/org/opencv/samples/NativeActivity/CvNativeActivity.java +++ b/samples/android/native-activity/src/org/opencv/samples/NativeActivity/CvNativeActivity.java @@ -6,7 +6,6 @@ import org.opencv.android.OpenCVLoader; import android.app.Activity; import android.content.Intent; -import android.os.Bundle; import android.util.Log; public class CvNativeActivity extends Activity { diff --git a/samples/android/tutorial-1-camerapreview/res/layout/tutorial1_surface_view.xml b/samples/android/tutorial-1-camerapreview/res/layout/tutorial1_surface_view.xml index 52b9dfaf6..40f5f4932 100644 --- a/samples/android/tutorial-1-camerapreview/res/layout/tutorial1_surface_view.xml +++ b/samples/android/tutorial-1-camerapreview/res/layout/tutorial1_surface_view.xml @@ -1,4 +1,4 @@ 
- - +
diff --git a/samples/android/tutorial-3-cameracontrol/src/org/opencv/samples/tutorial3/Tutorial3View.java b/samples/android/tutorial-3-cameracontrol/src/org/opencv/samples/tutorial3/Tutorial3View.java
index 04c163ff8..7ba2d9f96 100644
--- a/samples/android/tutorial-3-cameracontrol/src/org/opencv/samples/tutorial3/Tutorial3View.java
+++ b/samples/android/tutorial-3-cameracontrol/src/org/opencv/samples/tutorial3/Tutorial3View.java
@@ -6,17 +6,16 @@ import java.util.List;
 import org.opencv.android.JavaCameraView;
 import android.content.Context;
-import android.graphics.Bitmap;
-import android.graphics.BitmapFactory;
 import android.hardware.Camera;
 import android.hardware.Camera.PictureCallback;
 import android.hardware.Camera.Size;
 import android.util.AttributeSet;
 import android.util.Log;
-public class Tutorial3View extends JavaCameraView {
+public class Tutorial3View extends JavaCameraView implements PictureCallback {
     private static final String TAG = "Sample::Tutorial3View";
+    private String mPictureFileName;
     public Tutorial3View(Context context, AttributeSet attrs) {
         super(context, attrs);
@@ -56,26 +55,33 @@ public class Tutorial3View extends JavaCameraView {
     }
     public void takePicture(final String fileName) {
-        Log.i(TAG, "Tacking picture");
-        PictureCallback callback = new PictureCallback() {
+        Log.i(TAG, "Taking picture");
+        this.mPictureFileName = fileName;
+        // Postview and jpeg are sent in the same buffers if the queue is not empty when performing a capture.
+        // Clear up buffers to avoid mCamera.takePicture to be stuck because of a memory issue
+        mCamera.setPreviewCallback(null);
-            private String mPictureFileName = fileName;
-
-            @Override
-            public void onPictureTaken(byte[] data, Camera camera) {
-                Log.i(TAG, "Saving a bitmap to file");
-                Bitmap picture = BitmapFactory.decodeByteArray(data, 0, data.length);
-                try {
-                    FileOutputStream out = new FileOutputStream(mPictureFileName);
-                    picture.compress(Bitmap.CompressFormat.JPEG, 90, out);
-                    picture.recycle();
-                    mCamera.startPreview();
-                } catch (Exception e) {
-                    e.printStackTrace();
-                }
-            }
-        };
-
-        mCamera.takePicture(null, null, callback);
+        // PictureCallback is implemented by the current class
+        mCamera.takePicture(null, null, this);
     }
-}
+
+    @Override
+    public void onPictureTaken(byte[] data, Camera camera) {
+        Log.i(TAG, "Saving a bitmap to file");
+        // The camera preview was automatically stopped. Start it again.
+        mCamera.startPreview();
+        mCamera.setPreviewCallback(this);
+
+        // Write the image in a file (in jpeg format)
+        try {
+            FileOutputStream fos = new FileOutputStream(mPictureFileName);
+
+            fos.write(data);
+            fos.close();
+
+        } catch (java.io.IOException e) {
+            Log.e("PictureDemo", "Exception in photoCallback", e);
+        }
+
+    }
+}
\ No newline at end of file
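Usage note on the Tutorial3View change above: after this patch the view itself implements android.hardware.Camera.PictureCallback, so a caller only invokes takePicture(fileName); the view clears the preview callback, hands itself to mCamera.takePicture(), and onPictureTaken() writes the raw JPEG buffer straight to the given path (no Bitmap decode/compress round trip). A minimal hypothetical caller is sketched below; the activity class name, the mCameraView field, and the touch-handler wiring are illustrative assumptions and are not part of this patch (the real tutorial-3 activity is not shown in this diff).

```java
// Hypothetical caller sketch (not part of this patch): an activity that owns a
// Tutorial3View and snaps a picture on touch, relying on the new takePicture(String).
import org.opencv.samples.tutorial3.Tutorial3View;

import android.app.Activity;
import android.os.Environment;
import android.view.MotionEvent;
import android.view.View;

public class PictureDemoActivity extends Activity implements View.OnTouchListener {
    // Assumed to be wired up elsewhere, e.g. inflated from the sample's layout XML.
    private Tutorial3View mCameraView;

    @Override
    public boolean onTouch(View v, MotionEvent event) {
        // Build a target path; Tutorial3View.onPictureTaken() writes the JPEG bytes to it.
        String fileName = Environment.getExternalStorageDirectory().getPath()
                + "/sample_picture_" + System.currentTimeMillis() + ".jpg";
        mCameraView.takePicture(fileName);
        return false;
    }
}
```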