diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fff2d614..2fb1cf7f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,6 +156,7 @@ OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" ON OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" ON IF (NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" ON IF (NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_DIRECTX "Include DirectX support" ON IF WIN32 ) +OCV_OPTION(WITH_INTELPERC "Include Intel Perceptual Computing support" OFF IF WIN32 ) # OpenCV build components @@ -207,7 +208,8 @@ OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) -OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND ARM) ) +OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND ARM ) +OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND ARM ) OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF ) OCV_OPTION(OPENCV_WARNINGS_ARE_ERRORS "Treat warnings as errors" OFF ) OCV_OPTION(ENABLE_WINRT_MODE "Build with Windows Runtime support" OFF IF WIN32 ) @@ -226,6 +228,15 @@ include(cmake/OpenCVVersion.cmake) # Save libs and executables in the same place set(EXECUTABLE_OUTPUT_PATH "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Output directory for applications" ) +if (ANDROID) + if (ANDROID_ABI MATCHES "NEON") + set(ENABLE_NEON ON) + endif() + if (ANDROID_ABI MATCHES "VFPV3") + set(ENABLE_VFPV3 ON) + endif() +endif() + if(ANDROID OR WIN32) set(OPENCV_DOC_INSTALL_PATH doc) elseif(INSTALL_TO_MANGLED_PATHS) @@ -820,6 +831,11 @@ if(DEFINED WITH_XINE) status(" Xine:" HAVE_XINE THEN "YES (ver ${ALIASOF_libxine_VERSION})" ELSE NO) endif(DEFINED WITH_XINE) +if(DEFINED WITH_INTELPERC) + status(" Intel PerC:" HAVE_INTELPERC THEN "YES" ELSE NO) +endif(DEFINED WITH_INTELPERC) + + # ========================== Other third-party libraries ========================== status("") status(" Other third-party libraries:") diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index fd36a45c6..59b19b601 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -124,6 +124,12 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(ENABLE_SSE2) add_extra_compiler_option(-msse2) endif() + if (ENABLE_NEON) + add_extra_compiler_option("-mfpu=neon") + endif() + if (ENABLE_VFPV3 AND NOT ENABLE_NEON) + add_extra_compiler_option("-mfpu=vfpv3") + endif() # SSE3 and further should be disabled under MingW because it generates compiler errors if(NOT MINGW) diff --git a/cmake/OpenCVFindIntelPerCSDK.cmake b/cmake/OpenCVFindIntelPerCSDK.cmake new file mode 100644 index 000000000..724310560 --- /dev/null +++ b/cmake/OpenCVFindIntelPerCSDK.cmake @@ -0,0 +1,20 @@ +# Main variables: +# INTELPERC_LIBRARIES and INTELPERC_INCLUDE to link Intel Perceptial Computing SDK modules +# HAVE_INTELPERC for conditional compilation OpenCV with/without Intel Perceptial Computing SDK + +if(X86_64) + find_path(INTELPERC_INCLUDE_DIR "pxcsession.h" PATHS "$ENV{PCSDK_DIR}include" DOC "Path to Intel Perceptual Computing SDK interface headers") + find_file(INTELPERC_LIBRARIES 
"libpxc.lib" PATHS "$ENV{PCSDK_DIR}lib/x64" DOC "Path to Intel Perceptual Computing SDK interface libraries") +else() + find_path(INTELPERC_INCLUDE_DIR "pxcsession.h" PATHS "$ENV{PCSDK_DIR}include" DOC "Path to Intel Perceptual Computing SDK interface headers") + find_file(INTELPERC_LIBRARIES "libpxc.lib" PATHS "$ENV{PCSDK_DIR}lib/Win32" DOC "Path to Intel Perceptual Computing SDK interface libraries") +endif() + +if(INTELPERC_INCLUDE_DIR AND INTELPERC_LIBRARIES) + set(HAVE_INTELPERC TRUE) +else() + set(HAVE_INTELPERC FALSE) + message(WARNING "Intel Perceptual Computing SDK library directory (set by INTELPERC_LIB_DIR variable) is not found or does not have Intel Perceptual Computing SDK libraries.") +endif() #if(INTELPERC_INCLUDE_DIR AND INTELPERC_LIBRARIES) + +mark_as_advanced(FORCE INTELPERC_LIBRARIES INTELPERC_INCLUDE_DIR) \ No newline at end of file diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake index 807f4fbbf..93cce2b7a 100644 --- a/cmake/OpenCVFindLibsVideo.cmake +++ b/cmake/OpenCVFindLibsVideo.cmake @@ -277,3 +277,8 @@ if (NOT IOS) set(HAVE_QTKIT YES) endif() endif() + +# --- Intel Perceptual Computing SDK --- +if(WITH_INTELPERC) + include("${OpenCV_SOURCE_DIR}/cmake/OpenCVFindIntelPerCSDK.cmake") +endif(WITH_INTELPERC) diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in index 554b91cef..e1beaada7 100644 --- a/cmake/templates/cvconfig.h.in +++ b/cmake/templates/cvconfig.h.in @@ -88,6 +88,9 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_INTTYPES_H 1 +/* Intel Perceptual Computing SDK library */ +#cmakedefine HAVE_INTELPERC + /* Intel Integrated Performance Primitives */ #cmakedefine HAVE_IPP diff --git a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst index 0b2253ace..87f6d9d4d 100644 --- a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst +++ b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst @@ -106,8 +106,8 @@ Enable hardware optimizations ----------------------------- Depending on target platform architecture different instruction sets can be used. By default -compiler generates code for armv5l without VFPv3 and NEON extensions. Add ``-DUSE_VFPV3=ON`` -to cmake command line to enable code generation for VFPv3 and ``-DUSE_NEON=ON`` for using +compiler generates code for armv5l without VFPv3 and NEON extensions. Add ``-DENABLE_VFPV3=ON`` +to cmake command line to enable code generation for VFPv3 and ``-DENABLE_NEON=ON`` for using NEON SIMD extensions. TBB is supported on multi core ARM SoCs also. diff --git a/doc/user_guide/ug_intelperc.rst b/doc/user_guide/ug_intelperc.rst new file mode 100644 index 000000000..bae5f7014 --- /dev/null +++ b/doc/user_guide/ug_intelperc.rst @@ -0,0 +1,79 @@ +******* +HighGUI +******* + +.. highlight:: cpp + +Using Creative Senz3D and other Intel Perceptual Computing SDK compatible depth sensors +======================================================================================= + +Depth sensors compatible with Intel Perceptual Computing SDK are supported through ``VideoCapture`` class. Depth map, RGB image and some other formats of output can be retrieved by using familiar interface of ``VideoCapture``. + +In order to use depth sensor with OpenCV you should do the following preliminary steps: + +#. + Install Intel Perceptual Computing SDK (from here http://www.intel.com/software/perceptual). + +#. 
+ Configure OpenCV with Intel Perceptual Computing SDK support by setting ``WITH_INTELPERC`` flag in CMake. If Intel Perceptual Computing SDK is found in install folders OpenCV will be built with Intel Perceptual Computing SDK library (see a status ``INTELPERC`` in CMake log). If CMake process doesn't find Intel Perceptual Computing SDK installation folder automatically, the user should change corresponding CMake variables ``INTELPERC_LIB_DIR`` and ``INTELPERC_INCLUDE_DIR`` to the proper value. + +#. + Build OpenCV. + +VideoCapture can retrieve the following data: + +#. + data given from depth generator: + * ``CV_CAP_INTELPERC_DEPTH_MAP`` - each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth. (CV_16UC1) + * ``CV_CAP_INTELPERC_UVDEPTH_MAP`` - each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates. (CV_32FC2) + * ``CV_CAP_INTELPERC_IR_MAP`` - each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam. (CV_16UC1) +#. + data given from RGB image generator: + * ``CV_CAP_INTELPERC_IMAGE`` - color image. (CV_8UC3) + +In order to get depth map from depth sensor use ``VideoCapture::operator >>``, e. g. :: + + VideoCapture capture( CV_CAP_INTELPERC ); + for(;;) + { + Mat depthMap; + capture >> depthMap; + + if( waitKey( 30 ) >= 0 ) + break; + } + +For getting several data maps use ``VideoCapture::grab`` and ``VideoCapture::retrieve``, e.g. :: + + VideoCapture capture(CV_CAP_INTELPERC); + for(;;) + { + Mat depthMap; + Mat image; + Mat irImage; + + capture.grab(); + + capture.retrieve( depthMap, CV_CAP_INTELPERC_DEPTH_MAP ); + capture.retrieve( image, CV_CAP_INTELPERC_IMAGE ); + capture.retrieve( irImage, CV_CAP_INTELPERC_IR_MAP); + + if( waitKey( 30 ) >= 0 ) + break; + } + +For setting and getting some property of sensor` data generators use ``VideoCapture::set`` and ``VideoCapture::get`` methods respectively, e.g. :: + + VideoCapture capture( CV_CAP_INTELPERC ); + capture.set( CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_PROFILE_IDX, 0 ); + cout << "FPS " << capture.get( CV_CAP_INTELPERC_DEPTH_GENERATOR+CV_CAP_PROP_FPS ) << endl; + +Since two types of sensor's data generators are supported (image generator and depth generator), there are two flags that should be used to set/get property of the needed generator: + +* CV_CAP_INTELPERC_IMAGE_GENERATOR -- a flag for access to the image generator properties. + +* CV_CAP_INTELPERC_DEPTH_GENERATOR -- a flag for access to the depth generator properties. This flag value is assumed by default if neither of the two possible values of the property is set. + +For more information please refer to the example of usage intelperc_capture.cpp_ in ``opencv/samples/cpp`` folder. + +.. 
_intelperc_capture.cpp: https://github.com/Itseez/opencv/tree/master/samples/cpp/intelperc_capture.cpp \ No newline at end of file diff --git a/doc/user_guide/user_guide.rst b/doc/user_guide/user_guide.rst index de9edcb68..76cf756f8 100644 --- a/doc/user_guide/user_guide.rst +++ b/doc/user_guide/user_guide.rst @@ -9,3 +9,4 @@ OpenCV User Guide ug_features2d.rst ug_highgui.rst ug_traincascade.rst + ug_intelperc.rst diff --git a/modules/core/doc/operations_on_arrays.rst b/modules/core/doc/operations_on_arrays.rst index a894d0768..c936457af 100644 --- a/modules/core/doc/operations_on_arrays.rst +++ b/modules/core/doc/operations_on_arrays.rst @@ -903,7 +903,7 @@ So, the function chooses an operation mode depending on the flags and size of th * When ``DFT_COMPLEX_OUTPUT`` is set, the output is a complex matrix of the same size as input. - * When ``DFT_COMPLEX_OUTPUT`` is not set, the output is a real matrix of the same size as input. In case of 2D transform, it uses the packed format as shown above. In case of a single 1D transform, it looks like the first row of the matrix above. In case of multiple 1D transforms (when using the ``DCT_ROWS`` flag), each row of the output matrix looks like the first row of the matrix above. + * When ``DFT_COMPLEX_OUTPUT`` is not set, the output is a real matrix of the same size as input. In case of 2D transform, it uses the packed format as shown above. In case of a single 1D transform, it looks like the first row of the matrix above. In case of multiple 1D transforms (when using the ``DFT_ROWS`` flag), each row of the output matrix looks like the first row of the matrix above. * If the input array is complex and either ``DFT_INVERSE`` or ``DFT_REAL_OUTPUT`` are not set, the output is a complex array of the same size as input. The function performs a forward or inverse 1D or 2D transform of the whole input array or each row of the input array independently, depending on the flags ``DFT_INVERSE`` and ``DFT_ROWS``. 
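Editor's note on the two hunks around this point: the documentation fix above and the ``dxt.cpp`` fix below are two sides of the same flag mix-up — ``cv::dft`` takes ``DFT_ROWS`` while ``cv::dct`` takes ``DCT_ROWS``. The two flags are defined to share the same value, so the old code happened to work; the change simply makes the intent explicit in both the docs and the implementation. A minimal sketch of the intended usage (the matrix sizes here are illustrative assumptions, not taken from the patch)::

    #include <opencv2/core.hpp>

    int main()
    {
        // 4 independent rows of length 8 (cv::dct requires even-sized transforms).
        cv::Mat rows = cv::Mat::eye(4, 8, CV_32F);

        cv::Mat dctOut, dftOut;
        cv::dct(rows, dctOut, cv::DCT_ROWS);                           // 1D DCT of each row
        cv::dft(rows, dftOut, cv::DFT_ROWS | cv::DFT_COMPLEX_OUTPUT);  // 1D DFT of each row
        return 0;
    }
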
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index c39f11d4f..acac45c52 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -2577,7 +2577,7 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags ) DCTFunc dct_func = dct_tbl[(int)inv + (depth == CV_64F)*2]; - if( (flags & DFT_ROWS) || src.rows == 1 || + if( (flags & DCT_ROWS) || src.rows == 1 || (src.cols == 1 && (src.isContinuous() && dst.isContinuous()))) { stage = end_stage = 0; @@ -2597,7 +2597,7 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags ) { len = src.cols; count = src.rows; - if( len == 1 && !(flags & DFT_ROWS) ) + if( len == 1 && !(flags & DCT_ROWS) ) { len = src.rows; count = 1; diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 6f2580498..d4db98119 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -2760,39 +2760,24 @@ void cv::transpose( InputArray _src, OutputArray _dst ) } +////////////////////////////////////// completeSymm ///////////////////////////////////////// + void cv::completeSymm( InputOutputArray _m, bool LtoR ) { Mat m = _m.getMat(); - CV_Assert( m.dims <= 2 ); + size_t step = m.step, esz = m.elemSize(); + CV_Assert( m.dims <= 2 && m.rows == m.cols ); - int i, j, nrows = m.rows, type = m.type(); - int j0 = 0, j1 = nrows; - CV_Assert( m.rows == m.cols ); + int rows = m.rows; + int j0 = 0, j1 = rows; - if( type == CV_32FC1 || type == CV_32SC1 ) + uchar* data = m.data; + for( int i = 0; i < rows; i++ ) { - int* data = (int*)m.data; - size_t step = m.step/sizeof(data[0]); - for( i = 0; i < nrows; i++ ) - { - if( !LtoR ) j1 = i; else j0 = i+1; - for( j = j0; j < j1; j++ ) - data[i*step + j] = data[j*step + i]; - } + if( !LtoR ) j1 = i; else j0 = i+1; + for( int j = j0; j < j1; j++ ) + memcpy(data + (i*step + j*esz), data + (j*step + i*esz), esz); } - else if( type == CV_64FC1 ) - { - double* data = (double*)m.data; - size_t step = m.step/sizeof(data[0]); - for( i = 0; i < nrows; i++ ) - { - if( !LtoR ) j1 = i; else j0 = i+1; - for( j = j0; j < j1; j++ ) - data[i*step + j] = data[j*step + i]; - } - } - else - CV_Error( CV_StsUnsupportedFormat, "" ); } diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt index 51ab0c3ef..ea8aefde5 100644 --- a/modules/highgui/CMakeLists.txt +++ b/modules/highgui/CMakeLists.txt @@ -222,6 +222,12 @@ elseif(HAVE_QTKIT) list(APPEND HIGHGUI_LIBRARIES "-framework QTKit" "-framework QuartzCore" "-framework AppKit") endif() +if(HAVE_INTELPERC) + list(APPEND highgui_srcs src/cap_intelperc.cpp) + ocv_include_directories(${INTELPERC_INCLUDE_DIR}) + list(APPEND HIGHGUI_LIBRARIES ${INTELPERC_LIBRARIES}) +endif(HAVE_INTELPERC) + if(IOS) add_definitions(-DHAVE_IOS=1) list(APPEND highgui_srcs src/ios_conversions.mm src/cap_ios_abstract_camera.mm src/cap_ios_photo_camera.mm src/cap_ios_video_camera.mm) diff --git a/modules/highgui/include/opencv2/highgui.hpp b/modules/highgui/include/opencv2/highgui.hpp index cebf8fe22..eb4ee8c03 100644 --- a/modules/highgui/include/opencv2/highgui.hpp +++ b/modules/highgui/include/opencv2/highgui.hpp @@ -271,7 +271,8 @@ enum { CAP_ANY = 0, // autodetect CAP_XIAPI = 1100, // XIMEA Camera API CAP_AVFOUNDATION = 1200, // AVFoundation framework for iOS (OS X Lion will have the same API) CAP_GIGANETIX = 1300, // Smartek Giganetix GigEVisionSDK - CAP_MSMF = 1400 // Microsoft Media Foundation (via videoInput) + CAP_MSMF = 1400, // Microsoft Media Foundation (via videoInput) + CAP_INTELPERC = 1500 // Intel Perceptual Computing SDK }; // 
generic properties (based on DC1394 properties) @@ -496,6 +497,26 @@ enum { CAP_PROP_GIGA_FRAME_OFFSET_X = 10001, CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006 }; +enum { CAP_PROP_INTELPERC_PROFILE_COUNT = 11001, + CAP_PROP_INTELPERC_PROFILE_IDX = 11002, + CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE = 11003, + CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE = 11004, + CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD = 11005, + CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ = 11006, + CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT = 11007 + }; + +// Intel PerC streams +enum { CAP_INTELPERC_DEPTH_GENERATOR = 1 << 29, + CAP_INTELPERC_IMAGE_GENERATOR = 1 << 28, + CAP_INTELPERC_GENERATORS_MASK = CAP_INTELPERC_DEPTH_GENERATOR + CAP_INTELPERC_IMAGE_GENERATOR + }; + +enum { CAP_INTELPERC_DEPTH_MAP = 0, // Each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth. + CAP_INTELPERC_UVDEPTH_MAP = 1, // Each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates. + CAP_INTELPERC_IR_MAP = 2, // Each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam. + CAP_INTELPERC_IMAGE = 3 + }; class CV_EXPORTS_W VideoCapture { diff --git a/modules/highgui/include/opencv2/highgui/highgui_c.h b/modules/highgui/include/opencv2/highgui/highgui_c.h index 2ebea0d30..1a42e5804 100644 --- a/modules/highgui/include/opencv2/highgui/highgui_c.h +++ b/modules/highgui/include/opencv2/highgui/highgui_c.h @@ -313,7 +313,9 @@ enum CV_CAP_AVFOUNDATION = 1200, // AVFoundation framework for iOS (OS X Lion will have the same API) - CV_CAP_GIGANETIX = 1300 // Smartek Giganetix GigEVisionSDK + CV_CAP_GIGANETIX = 1300, // Smartek Giganetix GigEVisionSDK + + CV_CAP_INTELPERC = 1500 // Intel Perceptual Computing SDK }; /* start capturing frames from camera: index = camera_index + domain_offset (CV_CAP_*) */ @@ -459,16 +461,29 @@ enum CV_CAP_PROP_IOS_DEVICE_EXPOSURE = 9002, CV_CAP_PROP_IOS_DEVICE_FLASH = 9003, CV_CAP_PROP_IOS_DEVICE_WHITEBALANCE = 9004, - CV_CAP_PROP_IOS_DEVICE_TORCH = 9005 + CV_CAP_PROP_IOS_DEVICE_TORCH = 9005, // Properties of cameras available through Smartek Giganetix Ethernet Vision interface /* --- Vladimir Litvinenko (litvinenko.vladimir@gmail.com) --- */ - ,CV_CAP_PROP_GIGA_FRAME_OFFSET_X = 10001, + CV_CAP_PROP_GIGA_FRAME_OFFSET_X = 10001, CV_CAP_PROP_GIGA_FRAME_OFFSET_Y = 10002, CV_CAP_PROP_GIGA_FRAME_WIDTH_MAX = 10003, CV_CAP_PROP_GIGA_FRAME_HEIGH_MAX = 10004, CV_CAP_PROP_GIGA_FRAME_SENS_WIDTH = 10005, - CV_CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006 + CV_CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006, + + CV_CAP_PROP_INTELPERC_PROFILE_COUNT = 11001, + CV_CAP_PROP_INTELPERC_PROFILE_IDX = 11002, + CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE = 11003, + CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE = 11004, + CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD = 11005, + CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ = 11006, + CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT = 11007, + + // Intel PerC streams + CV_CAP_INTELPERC_DEPTH_GENERATOR = 1 << 29, + CV_CAP_INTELPERC_IMAGE_GENERATOR = 1 << 28, + CV_CAP_INTELPERC_GENERATORS_MASK = CV_CAP_INTELPERC_DEPTH_GENERATOR + CV_CAP_INTELPERC_IMAGE_GENERATOR }; enum @@ -549,6 +564,14 @@ enum CV_CAP_ANDROID_ANTIBANDING_OFF }; +enum +{ + CV_CAP_INTELPERC_DEPTH_MAP = 0, // Each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth. 
+ CV_CAP_INTELPERC_UVDEPTH_MAP = 1, // Each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates. + CV_CAP_INTELPERC_IR_MAP = 2, // Each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam. + CV_CAP_INTELPERC_IMAGE = 3 +}; + /* retrieve or set capture properties */ CVAPI(double) cvGetCaptureProperty( CvCapture* capture, int property_id ); CVAPI(int) cvSetCaptureProperty( CvCapture* capture, int property_id, double value ); diff --git a/modules/highgui/src/cap.cpp b/modules/highgui/src/cap.cpp index 0f4e6afb8..105f92e8c 100644 --- a/modules/highgui/src/cap.cpp +++ b/modules/highgui/src/cap.cpp @@ -155,6 +155,9 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index) #endif #ifdef HAVE_GIGE_API CV_CAP_GIGANETIX, +#endif +#ifdef HAVE_INTELPERC + CV_CAP_INTELPERC, #endif -1 }; @@ -193,6 +196,7 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index) defined(HAVE_AVFOUNDATION) || \ defined(HAVE_ANDROID_NATIVE_CAMERA) || \ defined(HAVE_GIGE_API) || \ + defined(HAVE_INTELPERC) || \ (0) // local variable to memorize the captured device CvCapture *capture; @@ -342,6 +346,13 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index) break; // CV_CAP_GIGANETIX #endif +#ifdef HAVE_INTELPERC + case CV_CAP_INTELPERC: + capture = cvCreateCameraCapture_IntelPerC(index); + if (capture) + return capture; + break; // CV_CAP_INTEL_PERC +#endif } } diff --git a/modules/highgui/src/cap_intelperc.cpp b/modules/highgui/src/cap_intelperc.cpp new file mode 100644 index 000000000..368f4fd2c --- /dev/null +++ b/modules/highgui/src/cap_intelperc.cpp @@ -0,0 +1,714 @@ +#include "precomp.hpp" + +#ifdef HAVE_INTELPERC + +#include "pxcsession.h" +#include "pxcsmartptr.h" +#include "pxccapture.h" + +class CvIntelPerCStreamBase +{ +protected: + struct FrameInternal + { + IplImage* retrieveFrame() + { + if (m_mat.empty()) + return NULL; + m_iplHeader = IplImage(m_mat); + return &m_iplHeader; + } + cv::Mat m_mat; + private: + IplImage m_iplHeader; + }; +public: + CvIntelPerCStreamBase() + : m_profileIdx(-1) + , m_frameIdx(0) + , m_timeStampStartNS(0) + { + } + virtual ~CvIntelPerCStreamBase() + { + } + + bool isValid() + { + return (m_device.IsValid() && m_stream.IsValid()); + } + bool grabFrame() + { + if (!m_stream.IsValid()) + return false; + if (-1 == m_profileIdx) + { + if (!setProperty(CV_CAP_PROP_INTELPERC_PROFILE_IDX, 0)) + return false; + } + PXCSmartPtr pxcImage; PXCSmartSP sp; + if (PXC_STATUS_NO_ERROR > m_stream->ReadStreamAsync(&pxcImage, &sp)) + return false; + if (PXC_STATUS_NO_ERROR > sp->Synchronize()) + return false; + if (0 == m_timeStampStartNS) + m_timeStampStartNS = pxcImage->QueryTimeStamp(); + m_timeStamp = (double)((pxcImage->QueryTimeStamp() - m_timeStampStartNS) / 10000); + m_frameIdx++; + return prepareIplImage(pxcImage); + } + int getProfileIDX() const + { + return m_profileIdx; + } +public: + virtual bool initStream(PXCSession *session) = 0; + virtual double getProperty(int propIdx) + { + double ret = 0.0; + switch (propIdx) + { + case CV_CAP_PROP_INTELPERC_PROFILE_COUNT: + ret = (double)m_profiles.size(); + break; + case CV_CAP_PROP_FRAME_WIDTH : + if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size())) + ret = (double)m_profiles[m_profileIdx].imageInfo.width; + break; + case CV_CAP_PROP_FRAME_HEIGHT : + if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size())) + ret = (double)m_profiles[m_profileIdx].imageInfo.height; + break; + case CV_CAP_PROP_FPS : + if ((0 <= 
m_profileIdx) && (m_profileIdx < m_profiles.size())) + { + ret = ((double)m_profiles[m_profileIdx].frameRateMin.numerator / (double)m_profiles[m_profileIdx].frameRateMin.denominator + + (double)m_profiles[m_profileIdx].frameRateMax.numerator / (double)m_profiles[m_profileIdx].frameRateMax.denominator) / 2.0; + } + break; + case CV_CAP_PROP_POS_FRAMES: + ret = (double)m_frameIdx; + break; + case CV_CAP_PROP_POS_MSEC: + ret = m_timeStamp; + break; + }; + return ret; + } + virtual bool setProperty(int propIdx, double propVal) + { + bool isSet = false; + switch (propIdx) + { + case CV_CAP_PROP_INTELPERC_PROFILE_IDX: + { + int propValInt = (int)propVal; + if ((0 <= propValInt) && (propValInt < m_profiles.size())) + { + if (m_profileIdx != propValInt) + { + m_profileIdx = propValInt; + if (m_stream.IsValid()) + m_stream->SetProfile(&m_profiles[m_profileIdx]); + m_frameIdx = 0; + m_timeStampStartNS = 0; + } + isSet = true; + } + } + break; + }; + return isSet; + } +protected: + PXCSmartPtr m_device; + bool initDevice(PXCSession *session) + { + if (NULL == session) + return false; + + pxcStatus sts = PXC_STATUS_NO_ERROR; + PXCSession::ImplDesc templat; + memset(&templat,0,sizeof(templat)); + templat.group = PXCSession::IMPL_GROUP_SENSOR; + templat.subgroup= PXCSession::IMPL_SUBGROUP_VIDEO_CAPTURE; + + for (int modidx = 0; PXC_STATUS_NO_ERROR <= sts; modidx++) + { + PXCSession::ImplDesc desc; + sts = session->QueryImpl(&templat, modidx, &desc); + if (PXC_STATUS_NO_ERROR > sts) + break; + + PXCSmartPtr capture; + sts = session->CreateImpl(&desc, &capture); + if (!capture.IsValid()) + continue; + + /* enumerate devices */ + for (int devidx = 0; PXC_STATUS_NO_ERROR <= sts; devidx++) + { + PXCSmartPtr device; + sts = capture->CreateDevice(devidx, &device); + if (PXC_STATUS_NO_ERROR <= sts) + { + m_device = device.ReleasePtr(); + return true; + } + } + } + return false; + } + + PXCSmartPtr m_stream; + void initStreamImpl(PXCImage::ImageType type) + { + if (!m_device.IsValid()) + return; + + pxcStatus sts = PXC_STATUS_NO_ERROR; + /* enumerate streams */ + for (int streamidx = 0; PXC_STATUS_NO_ERROR <= sts; streamidx++) + { + PXCCapture::Device::StreamInfo sinfo; + sts = m_device->QueryStream(streamidx, &sinfo); + if (PXC_STATUS_NO_ERROR > sts) + break; + if (PXCCapture::VideoStream::CUID != sinfo.cuid) + continue; + if (type != sinfo.imageType) + continue; + + sts = m_device->CreateStream(streamidx, &m_stream); + if (PXC_STATUS_NO_ERROR == sts) + break; + m_stream.ReleaseRef(); + } + } +protected: + std::vector m_profiles; + int m_profileIdx; + int m_frameIdx; + pxcU64 m_timeStampStartNS; + double m_timeStamp; + + virtual bool validProfile(const PXCCapture::VideoStream::ProfileInfo& /*pinfo*/) + { + return true; + } + void enumProfiles() + { + m_profiles.clear(); + if (!m_stream.IsValid()) + return; + pxcStatus sts = PXC_STATUS_NO_ERROR; + for (int profidx = 0; PXC_STATUS_NO_ERROR <= sts; profidx++) + { + PXCCapture::VideoStream::ProfileInfo pinfo; + sts = m_stream->QueryProfile(profidx, &pinfo); + if (PXC_STATUS_NO_ERROR > sts) + break; + if (validProfile(pinfo)) + m_profiles.push_back(pinfo); + } + } + virtual bool prepareIplImage(PXCImage *pxcImage) = 0; +}; + +class CvIntelPerCStreamImage + : public CvIntelPerCStreamBase +{ +public: + CvIntelPerCStreamImage() + { + } + virtual ~CvIntelPerCStreamImage() + { + } + + virtual bool initStream(PXCSession *session) + { + if (!initDevice(session)) + return false; + initStreamImpl(PXCImage::IMAGE_TYPE_COLOR); + if (!m_stream.IsValid()) + return false; + 
enumProfiles(); + return true; + } + virtual double getProperty(int propIdx) + { + switch (propIdx) + { + case CV_CAP_PROP_BRIGHTNESS: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_BRIGHTNESS, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_CONTRAST: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_CONTRAST, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_SATURATION: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_SATURATION, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_HUE: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_HUE, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_GAMMA: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_GAMMA, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_SHARPNESS: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_SHARPNESS, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_GAIN: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_GAIN, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_BACKLIGHT: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_BACK_LIGHT_COMPENSATION, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_EXPOSURE: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_EXPOSURE, &fret)) + return (double)fret; + return 0.0; + } + break; + //Add image stream specific properties + } + return CvIntelPerCStreamBase::getProperty(propIdx); + } + virtual bool setProperty(int propIdx, double propVal) + { + switch (propIdx) + { + case CV_CAP_PROP_BRIGHTNESS: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_BRIGHTNESS, (float)propVal)); + } + break; + case CV_CAP_PROP_CONTRAST: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_CONTRAST, (float)propVal)); + } + break; + case CV_CAP_PROP_SATURATION: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_SATURATION, (float)propVal)); + } + break; + case CV_CAP_PROP_HUE: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_HUE, (float)propVal)); + } + break; + case CV_CAP_PROP_GAMMA: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_GAMMA, 
(float)propVal)); + } + break; + case CV_CAP_PROP_SHARPNESS: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_SHARPNESS, (float)propVal)); + } + break; + case CV_CAP_PROP_GAIN: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_GAIN, (float)propVal)); + } + break; + case CV_CAP_PROP_BACKLIGHT: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_BACK_LIGHT_COMPENSATION, (float)propVal)); + } + break; + case CV_CAP_PROP_EXPOSURE: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_EXPOSURE, (float)propVal)); + } + break; + //Add image stream specific properties + } + return CvIntelPerCStreamBase::setProperty(propIdx, propVal); + } +public: + IplImage* retrieveFrame() + { + return m_frame.retrieveFrame(); + } +protected: + FrameInternal m_frame; + bool prepareIplImage(PXCImage *pxcImage) + { + if (NULL == pxcImage) + return false; + PXCImage::ImageInfo info; + pxcImage->QueryInfo(&info); + + PXCImage::ImageData data; + pxcImage->AcquireAccess(PXCImage::ACCESS_READ, PXCImage::COLOR_FORMAT_RGB24, &data); + + if (PXCImage::SURFACE_TYPE_SYSTEM_MEMORY != data.type) + return false; + + cv::Mat temp(info.height, info.width, CV_8UC3, data.planes[0], data.pitches[0]); + temp.copyTo(m_frame.m_mat); + + pxcImage->ReleaseAccess(&data); + return true; + } +}; + +class CvIntelPerCStreamDepth + : public CvIntelPerCStreamBase +{ +public: + CvIntelPerCStreamDepth() + { + } + virtual ~CvIntelPerCStreamDepth() + { + } + + virtual bool initStream(PXCSession *session) + { + if (!initDevice(session)) + return false; + initStreamImpl(PXCImage::IMAGE_TYPE_DEPTH); + if (!m_stream.IsValid()) + return false; + enumProfiles(); + return true; + } + virtual double getProperty(int propIdx) + { + switch (propIdx) + { + case CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_LOW_CONFIDENCE_VALUE, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_SATURATION_VALUE, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_CONFIDENCE_THRESHOLD, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ: + { + if (!m_device.IsValid()) + return 0.0f; + PXCPointF32 ptf; + if (PXC_STATUS_NO_ERROR == m_device->QueryPropertyAsPoint(PXCCapture::Device::PROPERTY_DEPTH_FOCAL_LENGTH, &ptf)) + return (double)ptf.x; + return 0.0; + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT: + { + if (!m_device.IsValid()) + return 0.0f; + PXCPointF32 ptf; + if (PXC_STATUS_NO_ERROR == m_device->QueryPropertyAsPoint(PXCCapture::Device::PROPERTY_DEPTH_FOCAL_LENGTH, &ptf)) + return (double)ptf.y; + return 0.0; + } + break; + //Add depth stream sepcific properties + } + return 
CvIntelPerCStreamBase::getProperty(propIdx); + } + virtual bool setProperty(int propIdx, double propVal) + { + switch (propIdx) + { + case CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_LOW_CONFIDENCE_VALUE, (float)propVal)); + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_SATURATION_VALUE, (float)propVal)); + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_CONFIDENCE_THRESHOLD, (float)propVal)); + } + break; + //Add depth stream sepcific properties + } + return CvIntelPerCStreamBase::setProperty(propIdx, propVal); + } +public: + IplImage* retrieveDepthFrame() + { + return m_frameDepth.retrieveFrame(); + } + IplImage* retrieveIRFrame() + { + return m_frameIR.retrieveFrame(); + } + IplImage* retrieveUVFrame() + { + return m_frameUV.retrieveFrame(); + } +protected: + virtual bool validProfile(const PXCCapture::VideoStream::ProfileInfo& pinfo) + { + return (PXCImage::COLOR_FORMAT_DEPTH == pinfo.imageInfo.format); + } +protected: + FrameInternal m_frameDepth; + FrameInternal m_frameIR; + FrameInternal m_frameUV; + + bool prepareIplImage(PXCImage *pxcImage) + { + if (NULL == pxcImage) + return false; + PXCImage::ImageInfo info; + pxcImage->QueryInfo(&info); + + PXCImage::ImageData data; + pxcImage->AcquireAccess(PXCImage::ACCESS_READ, &data); + + if (PXCImage::SURFACE_TYPE_SYSTEM_MEMORY != data.type) + return false; + + if (PXCImage::COLOR_FORMAT_DEPTH != data.format) + return false; + + { + cv::Mat temp(info.height, info.width, CV_16SC1, data.planes[0], data.pitches[0]); + temp.copyTo(m_frameDepth.m_mat); + } + { + cv::Mat temp(info.height, info.width, CV_16SC1, data.planes[1], data.pitches[1]); + temp.copyTo(m_frameIR.m_mat); + } + { + cv::Mat temp(info.height, info.width, CV_32FC2, data.planes[2], data.pitches[2]); + temp.copyTo(m_frameUV.m_mat); + } + + pxcImage->ReleaseAccess(&data); + return true; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class CvCapture_IntelPerC : public CvCapture +{ +public: + CvCapture_IntelPerC(int /*index*/) + : m_contextOpened(false) + { + pxcStatus sts = PXCSession_Create(&m_session); + if (PXC_STATUS_NO_ERROR > sts) + return; + m_contextOpened = m_imageStream.initStream(m_session); + m_contextOpened &= m_depthStream.initStream(m_session); + } + virtual ~CvCapture_IntelPerC(){} + + virtual double getProperty(int propIdx) + { + double propValue = 0; + int purePropIdx = propIdx & ~CV_CAP_INTELPERC_GENERATORS_MASK; + if (CV_CAP_INTELPERC_IMAGE_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK)) + { + propValue = m_imageStream.getProperty(purePropIdx); + } + else if (CV_CAP_INTELPERC_DEPTH_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK)) + { + propValue = m_depthStream.getProperty(purePropIdx); + } + else + { + propValue = m_depthStream.getProperty(purePropIdx); + } + return propValue; + } + virtual bool setProperty(int propIdx, double propVal) + { + bool isSet = false; + int purePropIdx = propIdx & ~CV_CAP_INTELPERC_GENERATORS_MASK; + if (CV_CAP_INTELPERC_IMAGE_GENERATOR == (propIdx & 
CV_CAP_INTELPERC_GENERATORS_MASK)) + { + isSet = m_imageStream.setProperty(purePropIdx, propVal); + } + else if (CV_CAP_INTELPERC_DEPTH_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK)) + { + isSet = m_depthStream.setProperty(purePropIdx, propVal); + } + else + { + isSet = m_depthStream.setProperty(purePropIdx, propVal); + } + return isSet; + } + + bool grabFrame() + { + if (!isOpened()) + return false; + + bool isGrabbed = false; + if (m_depthStream.isValid()) + isGrabbed = m_depthStream.grabFrame(); + if ((m_imageStream.isValid()) && (-1 != m_imageStream.getProfileIDX())) + isGrabbed &= m_imageStream.grabFrame(); + + return isGrabbed; + } + + virtual IplImage* retrieveFrame(int outputType) + { + IplImage* image = 0; + switch (outputType) + { + case CV_CAP_INTELPERC_DEPTH_MAP: + image = m_depthStream.retrieveDepthFrame(); + break; + case CV_CAP_INTELPERC_UVDEPTH_MAP: + image = m_depthStream.retrieveUVFrame(); + break; + case CV_CAP_INTELPERC_IR_MAP: + image = m_depthStream.retrieveIRFrame(); + break; + case CV_CAP_INTELPERC_IMAGE: + image = m_imageStream.retrieveFrame(); + break; + } + CV_Assert(NULL != image); + return image; + } + + bool isOpened() const + { + return m_contextOpened; + } +protected: + bool m_contextOpened; + + PXCSmartPtr m_session; + CvIntelPerCStreamImage m_imageStream; + CvIntelPerCStreamDepth m_depthStream; +}; + + +CvCapture* cvCreateCameraCapture_IntelPerC(int index) +{ + CvCapture_IntelPerC* capture = new CvCapture_IntelPerC(index); + + if( capture->isOpened() ) + return capture; + + delete capture; + return 0; +} + + +#endif //HAVE_INTELPERC diff --git a/modules/highgui/src/precomp.hpp b/modules/highgui/src/precomp.hpp index d225cb314..925cfdf40 100644 --- a/modules/highgui/src/precomp.hpp +++ b/modules/highgui/src/precomp.hpp @@ -128,6 +128,7 @@ CvCapture* cvCreateFileCapture_OpenNI( const char* filename ); CvCapture* cvCreateCameraCapture_Android( int index ); CvCapture* cvCreateCameraCapture_XIMEA( int index ); CvCapture* cvCreateCameraCapture_AVFoundation(int index); +CvCapture* cvCreateCameraCapture_IntelPerC(int index); CVAPI(int) cvHaveImageReader(const char* filename); diff --git a/modules/highgui/test/test_precomp.hpp b/modules/highgui/test/test_precomp.hpp index 8468e4618..826d16574 100644 --- a/modules/highgui/test/test_precomp.hpp +++ b/modules/highgui/test/test_precomp.hpp @@ -35,6 +35,7 @@ defined(HAVE_XIMEA) || \ defined(HAVE_AVFOUNDATION) || \ defined(HAVE_GIGE_API) || \ + defined(HAVE_INTELPERC) || \ (0) //defined(HAVE_ANDROID_NATIVE_CAMERA) || - enable after #1193 # define BUILD_WITH_CAMERA_SUPPORT 1 diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 875813068..30c42c032 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -3299,7 +3299,10 @@ public: if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) ) { bufxy = (*m1)(Rect(x, y, bcols, brows)); - bufa = (*m2)(Rect(x, y, bcols, brows)); + + const ushort* sA = (const ushort*)(m2->data + m2->step*(y+y1)) + x; + for( x1 = 0; x1 < bcols; x1++ ) + A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1)); } else if( planar_input ) { @@ -3680,7 +3683,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, { for( x = 0; x < size.width; x++ ) { - int fxy = src2 ? src2[x] : 0; + int fxy = src2 ? 
src2[x] & (INTER_TAB_SIZE2-1) : 0; dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale; dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale; } @@ -3689,7 +3692,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, { for( x = 0; x < size.width; x++ ) { - int fxy = src2 ? src2[x] : 0; + int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0; dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale; dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale; } diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py index c41e6336c..23ed3a9a1 100755 --- a/modules/java/generator/gen_java.py +++ b/modules/java/generator/gen_java.py @@ -18,6 +18,8 @@ class_ignore_list = ( const_ignore_list = ( "CV_CAP_OPENNI", "CV_CAP_PROP_OPENNI_", + "CV_CAP_INTELPERC", + "CV_CAP_PROP_INTELPERC_" "WINDOW_AUTOSIZE", "CV_WND_PROP_", "CV_WINDOW_", diff --git a/modules/nonfree/src/opencl/surf.cl b/modules/nonfree/src/opencl/surf.cl index 02f77c224..405e48f02 100644 --- a/modules/nonfree/src/opencl/surf.cl +++ b/modules/nonfree/src/opencl/surf.cl @@ -12,6 +12,7 @@ // // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2013, Intel Corporation, all rights reserved. // Third party copyrights are property of their respective owners. // // @Authors @@ -66,8 +67,8 @@ uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow) { #ifdef DISABLE_IMAGE2D - int x = clamp(convert_int_rte(coord.x), 0, cols - 1); - int y = clamp(convert_int_rte(coord.y), 0, rows - 1); + int x = clamp(round(coord.x), 0, cols - 1); + int y = clamp(round(coord.y), 0, rows - 1); return img[elemPerRow * y + x]; #else return (uchar)read_imageui(img, sam, coord).x; @@ -98,6 +99,7 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM #define CV_PI_F 3.14159265f #endif + // Use integral image to calculate haar wavelets. 
// N = 2 // for simple haar paatern @@ -114,10 +116,10 @@ float icvCalcHaarPatternSum_2( F d = 0; - int2 dx1 = convert_int2_rte(ratio * src[0]); - int2 dy1 = convert_int2_rte(ratio * src[1]); - int2 dx2 = convert_int2_rte(ratio * src[2]); - int2 dy2 = convert_int2_rte(ratio * src[3]); + int2 dx1 = convert_int2(round(ratio * src[0])); + int2 dy1 = convert_int2(round(ratio * src[1])); + int2 dx2 = convert_int2(round(ratio * src[2])); + int2 dy2 = convert_int2(round(ratio * src[3])); F t = 0; t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow ); @@ -136,106 +138,9 @@ float icvCalcHaarPatternSum_2( return (float)d; } -// N = 3 -float icvCalcHaarPatternSum_3( - IMAGE_INT32 sumTex, - __constant float4 *src, - int oldSize, - int newSize, - int y, int x, - int rows, int cols, int elemPerRow) -{ - - float ratio = (float)newSize / oldSize; - - F d = 0; - - int4 dx1 = convert_int4_rte(ratio * src[0]); - int4 dy1 = convert_int4_rte(ratio * src[1]); - int4 dx2 = convert_int4_rte(ratio * src[2]); - int4 dy2 = convert_int4_rte(ratio * src[3]); - - F t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow ); - d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow ); - d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow ); - d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z)); - - return (float)d; -} - -// N = 4 -float icvCalcHaarPatternSum_4( - IMAGE_INT32 sumTex, - __constant float4 *src, - int oldSize, - int newSize, - int y, int x, - int rows, int cols, int elemPerRow) -{ - - float ratio = (float)newSize / oldSize; - - F d = 0; - - int4 dx1 = convert_int4_rte(ratio * src[0]); - int4 dy1 = convert_int4_rte(ratio * src[1]); - int4 dx2 = convert_int4_rte(ratio * src[2]); - int4 dy2 = convert_int4_rte(ratio * src[3]); - - F t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow ); - d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + 
dx2.y, y + dy1.y), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow ); - d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow ); - d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy1.w), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy2.w), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy1.w), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy2.w), rows, cols, elemPerRow ); - d += t * src[4].w / ((dx2.w - dx1.w) * (dy2.w - dy1.w)); - - return (float)d; -} - //////////////////////////////////////////////////////////////////////// // Hessian -__constant float4 c_DX[5] = { (float4)(0, 3, 6, 0), (float4)(2, 2, 2, 0), (float4)(3, 6, 9, 0), (float4)(7, 7, 7, 0), (float4)(1, -2, 1, 0) }; -__constant float4 c_DY[5] = { (float4)(2, 2, 2, 0), (float4)(0, 3, 6, 0), (float4)(7, 7, 7, 0), (float4)(3, 6, 9, 0), (float4)(1, -2, 1, 0) }; -__constant float4 c_DXY[5] = { (float4)(1, 5, 1, 5), (float4)(1, 1, 5, 5), (float4)(4, 8, 4, 8), (float4)(4, 4, 8, 8), (float4)(1, -1, -1, 1) };// Use integral image to calculate haar wavelets. - __inline int calcSize(int octave, int layer) { /* Wavelet size at first layer of first octave. */ @@ -250,6 +155,24 @@ __inline int calcSize(int octave, int layer) return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; } +// Calculate a derivative in an axis-aligned direction (x or y). The "plus1" +// boxes contribute 1 * (area), and the "minus2" box contributes -2 * (area). +// So the final computation is plus1a + plus1b - 2 * minus2. The corners are +// labeled A, B, C, and D, with A being the top left, B being top right, C +// being bottom left, and D being bottom right. 
+F calcAxisAlignedDerivative( + int plus1a_A, int plus1a_B, int plus1a_C, int plus1a_D, F plus1a_scale, + int plus1b_A, int plus1b_B, int plus1b_C, int plus1b_D, F plus1b_scale, + int minus2_A, int minus2_B, int minus2_C, int minus2_D, F minus2_scale) +{ + F plus1a = plus1a_A - plus1a_B - plus1a_C + plus1a_D; + F plus1b = plus1b_A - plus1b_B - plus1b_C + plus1b_D; + F minus2 = minus2_A - minus2_B - minus2_C + minus2_D; + + return (plus1a / plus1a_scale - + 2.0f * minus2 / minus2_scale + + plus1b / plus1b_scale); +} //calculate targeted layer per-pixel determinant and trace with an integral image __kernel void icvCalcLayerDetAndTrace( @@ -264,7 +187,7 @@ __kernel void icvCalcLayerDetAndTrace( int c_octave, int c_layer_rows, int sumTex_step -) + ) { det_step /= sizeof(*det); trace_step /= sizeof(*trace); @@ -288,16 +211,103 @@ __kernel void icvCalcLayerDetAndTrace( if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) { - const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); - const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); - const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); + int x = j << c_octave; + int y = i << c_octave; + + float ratio = (float)size / 9; + + // Precompute some commonly used values, which are used to offset + // texture coordinates in the integral image. + int r1 = round(ratio); + int r2 = round(ratio * 2.0f); + int r3 = round(ratio * 3.0f); + int r4 = round(ratio * 4.0f); + int r5 = round(ratio * 5.0f); + int r6 = round(ratio * 6.0f); + int r7 = round(ratio * 7.0f); + int r8 = round(ratio * 8.0f); + int r9 = round(ratio * 9.0f); + + // Calculate the approximated derivative in the x-direction + F d = 0; + { + // Some of the pixels needed to compute the derivative are + // repeated, so we only don't duplicate the fetch here. + int t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sumTex_step ); + int t07 = read_sumTex( sumTex, sampler, (int2)(x, y + r7), c_img_rows, c_img_cols, sumTex_step ); + int t32 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r2), c_img_rows, c_img_cols, sumTex_step ); + int t37 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r7), c_img_rows, c_img_cols, sumTex_step ); + int t62 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r2), c_img_rows, c_img_cols, sumTex_step ); + int t67 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r7), c_img_rows, c_img_cols, sumTex_step ); + int t92 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r2), c_img_rows, c_img_cols, sumTex_step ); + int t97 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r7), c_img_rows, c_img_cols, sumTex_step ); + + d = calcAxisAlignedDerivative(t02, t07, t32, t37, (r3) * (r7 - r2), + t62, t67, t92, t97, (r9 - r6) * (r7 - r2), + t32, t37, t62, t67, (r6 - r3) * (r7 - r2)); + } + const float dx = (float)d; + + // Calculate the approximated derivative in the y-direction + d = 0; + { + // Some of the pixels needed to compute the derivative are + // repeated, so we only don't duplicate the fetch here. 
+ int t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sumTex_step ); + int t23 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r3), c_img_rows, c_img_cols, sumTex_step ); + int t70 = read_sumTex( sumTex, sampler, (int2)(x + r7, y), c_img_rows, c_img_cols, sumTex_step ); + int t73 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r3), c_img_rows, c_img_cols, sumTex_step ); + int t26 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r6), c_img_rows, c_img_cols, sumTex_step ); + int t76 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r6), c_img_rows, c_img_cols, sumTex_step ); + int t29 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r9), c_img_rows, c_img_cols, sumTex_step ); + int t79 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r9), c_img_rows, c_img_cols, sumTex_step ); + + d = calcAxisAlignedDerivative(t20, t23, t70, t73, (r7 - r2) * (r3), + t26, t29, t76, t79, (r7 - r2) * (r9 - r6), + t23, t26, t73, t76, (r7 - r2) * (r6 - r3)); + } + const float dy = (float)d; + + // Calculate the approximated derivative in the xy-direction + d = 0; + { + // There's no saving us here, we just have to get all of the pixels in + // separate fetches + F t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r1), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r4), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r1), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sumTex_step ); + d += t / ((r4 - r1) * (r4 - r1)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r1), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r4), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r1), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r4), c_img_rows, c_img_cols, sumTex_step ); + d -= t / ((r8 - r5) * (r4 - r1)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r5), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r8), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r5), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r8), c_img_rows, c_img_cols, sumTex_step ); + d -= t / ((r4 - r1) * (r8 - r5)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r5), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r8), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r5), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r8), c_img_rows, c_img_cols, sumTex_step ); + d += t / ((r8 - r5) * (r8 - r5)); + } + const float dxy = (float)d; det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy; trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy; } } - //////////////////////////////////////////////////////////////////////// // NONMAX @@ -309,10 +319,10 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro float d = 0; - int dx1 = convert_int_rte(ratio * c_DM[0]); - int dy1 = convert_int_rte(ratio * c_DM[1]); - int dx2 = convert_int_rte(ratio * c_DM[2]); - int dy2 = 
convert_int_rte(ratio * c_DM[3]); + int dx1 = round(ratio * c_DM[0]); + int dy1 = round(ratio * c_DM[1]); + int dx2 = round(ratio * c_DM[2]); + int dy2 = round(ratio * c_DM[3]); float t = 0; @@ -572,7 +582,7 @@ void icvFindMaximaInLayer( } // solve 3x3 linear system Ax=b for floating point input -inline bool solve3x3_float(volatile __local const float4 *A, volatile __local const float *b, volatile __local float *x) +inline bool solve3x3_float(const float4 *A, const float *b, float *x) { float det = A[0].x * (A[1].y * A[2].z - A[1].z * A[2].y) - A[0].y * (A[1].x * A[2].z - A[1].z * A[2].x) @@ -651,7 +661,7 @@ void icvInterpolateKeypoint( if (get_local_id(0) == 0 && get_local_id(1) == 0 && get_local_id(2) == 0) { - volatile __local float dD[3]; + float dD[3]; //dx dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]); @@ -660,7 +670,7 @@ void icvInterpolateKeypoint( //ds dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]); - volatile __local float4 H[3]; + float4 H[3]; //dxx H[0].x = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2]; @@ -681,7 +691,7 @@ void icvInterpolateKeypoint( //dss H[2].z = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1]; - volatile __local float x[3]; + float x[3]; if (solve3x3_float(H, dD, x)) { @@ -711,7 +721,7 @@ void icvInterpolateKeypoint( sampled in a circle of radius 6s using wavelets of size 4s. We ensure the gradient wavelet size is even to ensure the wavelet pattern is balanced and symmetric around its center */ - const int grad_wav_size = 2 * convert_int_rte(2.0f * s); + const int grad_wav_size = 2 * round(2.0f * s); // check when grad_wav_size is too big if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size) @@ -737,9 +747,12 @@ void icvInterpolateKeypoint( //////////////////////////////////////////////////////////////////////// // Orientation -#define ORI_SEARCH_INC 5 -#define ORI_WIN 60 -#define ORI_SAMPLES 113 +#define ORI_WIN 60 +#define ORI_SAMPLES 113 + +// The distance between samples in the beginning of the the reduction +#define ORI_RESPONSE_REDUCTION_WIDTH 48 +#define ORI_RESPONSE_ARRAY_SIZE (ORI_RESPONSE_REDUCTION_WIDTH * 2) __constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6}; __constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0}; @@ -833,12 +846,15 @@ void icvCalcOrientation( __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step; - volatile __local float s_X[128]; - volatile __local float s_Y[128]; - volatile __local float s_angle[128]; + __local float s_X[ORI_SAMPLES]; + __local float s_Y[ORI_SAMPLES]; + __local float s_angle[ORI_SAMPLES]; - volatile __local float s_sumx[32 * 4]; - volatile __local float s_sumy[32 * 4]; + // Need to allocate enough to make the reduction work without accessing + // past the end of the array. 
+ __local float s_sumx[ORI_RESPONSE_ARRAY_SIZE]; + __local float s_sumy[ORI_RESPONSE_ARRAY_SIZE]; + __local float s_mod[ORI_RESPONSE_ARRAY_SIZE]; /* The sampling intervals and wavelet sized for selecting an orientation and building the keypoint descriptor are defined relative to 's' */ @@ -849,28 +865,60 @@ void icvCalcOrientation( sampled in a circle of radius 6s using wavelets of size 4s. We ensure the gradient wavelet size is even to ensure the wavelet pattern is balanced and symmetric around its center */ - const int grad_wav_size = 2 * convert_int_rte(2.0f * s); + const int grad_wav_size = 2 * round(2.0f * s); // check when grad_wav_size is too big if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size) return; // Calc X, Y, angle and store it to shared memory - const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); + const int tid = get_local_id(0); + // Initialize values that are only used as part of the reduction later. + if (tid < ORI_RESPONSE_ARRAY_SIZE - ORI_LOCAL_SIZE) { + s_mod[tid + ORI_LOCAL_SIZE] = 0.0f; + } - float X = 0.0f, Y = 0.0f, angle = 0.0f; + float ratio = (float)grad_wav_size / 4; - if (tid < ORI_SAMPLES) + int r2 = round(ratio * 2.0); + int r4 = round(ratio * 4.0); + for (int i = tid; i < ORI_SAMPLES; i += ORI_LOCAL_SIZE ) { + float X = 0.0f, Y = 0.0f, angle = 0.0f; const float margin = (float)(grad_wav_size - 1) / 2.0f; - const int x = convert_int_rte(featureX[get_group_id(0)] + c_aptX[tid] * s - margin); - const int y = convert_int_rte(featureY[get_group_id(0)] + c_aptY[tid] * s - margin); + const int x = round(featureX[get_group_id(0)] + c_aptX[i] * s - margin); + const int y = round(featureY[get_group_id(0)] + c_aptY[i] * s - margin); if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size && - x >= 0 && x < (c_img_cols + 1) - grad_wav_size) + x >= 0 && x < (c_img_cols + 1) - grad_wav_size) { - X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step); - Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step); + + float apt = c_aptW[i]; + + // Compute the haar sum without fetching duplicate pixels. 
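+                // The eight samples t00..t44 lie on a 3x3 grid of integral-image corners
+                // at offsets {0, r2, r4} in x and y (the centre corner is not needed).
+                // The left/right halves of the X wavelet and the top/bottom halves of the
+                // Y wavelet are all box sums over these shared corners, so one set of
+                // eight fetches replaces the separate per-box lookups of the removed
+                // icvCalcHaarPatternSum_2 calls.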
+ float t00 = read_sumTex( sumTex, sampler, (int2)(x, y), c_img_rows, c_img_cols, sum_step); + float t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sum_step); + float t04 = read_sumTex( sumTex, sampler, (int2)(x, y + r4), c_img_rows, c_img_cols, sum_step); + float t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sum_step); + float t24 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r4), c_img_rows, c_img_cols, sum_step); + float t40 = read_sumTex( sumTex, sampler, (int2)(x + r4, y), c_img_rows, c_img_cols, sum_step); + float t42 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r2), c_img_rows, c_img_cols, sum_step); + float t44 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sum_step); + + F t = t00 - t04 - t20 + t24; + X -= t / ((r2) * (r4)); + + t = t20 - t24 - t40 + t44; + X += t / ((r4 - r2) * (r4)); + + t = t00 - t02 - t40 + t42; + Y += t / ((r2) * (r4)); + + t = t02 - t04 - t42 + t44; + Y -= t / ((r4) * (r4 - r2)); + + X = apt*X; + Y = apt*Y; angle = atan2(Y, X); @@ -879,76 +927,61 @@ void icvCalcOrientation( angle *= 180.0f / CV_PI_F; } + + s_X[i] = X; + s_Y[i] = Y; + s_angle[i] = angle; } - s_X[tid] = X; - s_Y[tid] = Y; - s_angle[tid] = angle; barrier(CLK_LOCAL_MEM_FENCE); float bestx = 0, besty = 0, best_mod = 0; + float sumx = 0.0f, sumy = 0.0f; + const int dir = tid * ORI_SEARCH_INC; + #pragma unroll + for (int i = 0; i < ORI_SAMPLES; ++i) { + int angle = round(s_angle[i]); -#pragma unroll - for (int i = 0; i < 18; ++i) - { - const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC; + int d = abs(angle - dir); + if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) + { + sumx += s_X[i]; + sumy += s_Y[i]; + } + } + s_sumx[tid] = sumx; + s_sumy[tid] = sumy; + s_mod[tid] = sumx*sumx + sumy*sumy; + barrier(CLK_LOCAL_MEM_FENCE); - volatile float sumx = 0.0f, sumy = 0.0f; - int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir); - if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) - { - sumx = s_X[get_local_id(0)]; - sumy = s_Y[get_local_id(0)]; - } - d = abs(convert_int_rte(s_angle[get_local_id(0) + 32]) - dir); - if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) - { - sumx += s_X[get_local_id(0) + 32]; - sumy += s_Y[get_local_id(0) + 32]; - } - d = abs(convert_int_rte(s_angle[get_local_id(0) + 64]) - dir); - if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) - { - sumx += s_X[get_local_id(0) + 64]; - sumy += s_Y[get_local_id(0) + 64]; - } - d = abs(convert_int_rte(s_angle[get_local_id(0) + 96]) - dir); - if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) - { - sumx += s_X[get_local_id(0) + 96]; - sumy += s_Y[get_local_id(0) + 96]; - } - reduce_32_sum(s_sumx + get_local_id(1) * 32, &sumx, get_local_id(0)); - reduce_32_sum(s_sumy + get_local_id(1) * 32, &sumy, get_local_id(0)); - - const float temp_mod = sumx * sumx + sumy * sumy; - if (temp_mod > best_mod) - { - best_mod = temp_mod; - bestx = sumx; - besty = sumy; + // This reduction searches for the longest wavelet response vector. The first + // step uses all of the work items in the workgroup to narrow the search + // down to the three candidates. It requires s_mod to have a few more + // elements alocated past the work-group size, which are pre-initialized to + // 0.0f above. 
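+    // With ORI_LOCAL_SIZE = 72 (360 / ORI_SEARCH_INC) and
+    // ORI_RESPONSE_REDUCTION_WIDTH = 48, the first pass folds entries [48, 96)
+    // onto [0, 48); the zero-initialized tail [72, 96) can never win because the
+    // comparison only replaces a strictly smaller s_mod.  The remaining passes
+    // (t = 24, 12, 6, 3) halve the range until only the three candidates at
+    // indices 0..2 are left for the final comparison below.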
+ for(int t = ORI_RESPONSE_REDUCTION_WIDTH; t >= 3; t /= 2) { + if (tid < t) { + if (s_mod[tid] < s_mod[tid + t]) { + s_mod[tid] = s_mod[tid + t]; + s_sumx[tid] = s_sumx[tid + t]; + s_sumy[tid] = s_sumy[tid + t]; + } } barrier(CLK_LOCAL_MEM_FENCE); } - if (get_local_id(0) == 0) - { - s_X[get_local_id(1)] = bestx; - s_Y[get_local_id(1)] = besty; - s_angle[get_local_id(1)] = best_mod; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (get_local_id(1) == 0 && get_local_id(0) == 0) + // Do the final reduction and write out the result. + if (tid == 0) { int bestIdx = 0; - if (s_angle[1] > s_angle[bestIdx]) + // The loop above narrowed the search of the longest vector to three + // possibilities. Pick the best here. + if (s_mod[1] > s_mod[bestIdx]) bestIdx = 1; - if (s_angle[2] > s_angle[bestIdx]) + if (s_mod[2] > s_mod[bestIdx]) bestIdx = 2; - if (s_angle[3] > s_angle[bestIdx]) - bestIdx = 3; - float kp_dir = atan2(s_Y[bestIdx], s_X[bestIdx]); + float kp_dir = atan2(s_sumy[bestIdx], s_sumx[bestIdx]); if (kp_dir < 0) kp_dir += 2.0f * CV_PI_F; kp_dir *= 180.0f / CV_PI_F; @@ -961,7 +994,6 @@ void icvCalcOrientation( } } - __kernel void icvSetUpright( __global float * keypoints, @@ -1035,8 +1067,8 @@ inline float linearFilter( float out = 0.0f; - const int x1 = convert_int_rtn(x); - const int y1 = convert_int_rtn(y); + const int x1 = round(x); + const int y1 = round(y); const int x2 = x1 + 1; const int y2 = y1 + 1; diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp index 5ade5e517..8fd717c6c 100644 --- a/modules/nonfree/src/surf.ocl.cpp +++ b/modules/nonfree/src/surf.ocl.cpp @@ -46,6 +46,7 @@ #ifdef HAVE_OPENCV_OCL #include +#include #include "opencl_kernels.hpp" using namespace cv; @@ -57,18 +58,25 @@ namespace cv { namespace ocl { + // The number of degrees between orientation samples in calcOrientation + const static int ORI_SEARCH_INC = 5; + // The local size of the calcOrientation kernel + const static int ORI_LOCAL_SIZE = (360 / ORI_SEARCH_INC); + static void openCLExecuteKernelSURF(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3], size_t localThreads[3], std::vector< std::pair > &args, int channels, int depth) { - char optBuf [100] = {0}; - char * optBufPtr = optBuf; + std::stringstream optsStr; + optsStr << "-D ORI_LOCAL_SIZE=" << ORI_LOCAL_SIZE << " "; + optsStr << "-D ORI_SEARCH_INC=" << ORI_SEARCH_INC << " "; cl_kernel kernel; - kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optBufPtr); + kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optsStr.str().c_str()); size_t wave_size = queryWaveFrontSize(kernel); CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS); - sprintf(optBufPtr, "-D WAVE_SIZE=%d", static_cast(wave_size)); - openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optBufPtr); + optsStr << "-D WAVE_SIZE=" << wave_size; + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optsStr.str().c_str()); } + } } @@ -601,8 +609,8 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols)); args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step)); - size_t localThreads[3] = {32, 4, 1}; - size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1}; + size_t localThreads[3] = {ORI_LOCAL_SIZE, 1, 1}; + size_t globalThreads[3] = {nFeatures * localThreads[0], 1, 1}; 
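The one-dimensional launch above (one work-group of ORI_LOCAL_SIZE work-items per keypoint) replaces the old 32x4 layout. For reference, the work each work-group performs is equivalent to the scalar sketch below; the function and parameter names are illustrative only and are not part of the patch. respX, respY and angleDeg stand for the per-sample values the kernel keeps in s_X, s_Y and s_angle.

#include <cmath>
#include <cstdlib>

// Scalar reference of the per-keypoint orientation search: each candidate
// direction corresponds to one work-item in the kernel, and the argmax over
// mod is what the s_mod reduction computes.
static float referenceOrientation(const float* respX, const float* respY,
                                  const float* angleDeg, int nSamples)
{
    const int ORI_SEARCH_INC = 5;   // degrees between candidate directions
    const int ORI_WIN        = 60;  // width of the sliding orientation window
    float bestx = 0.f, besty = 0.f, bestMod = -1.f;
    for (int dir = 0; dir < 360; dir += ORI_SEARCH_INC)
    {
        float sumx = 0.f, sumy = 0.f;
        for (int i = 0; i < nSamples; ++i)
        {
            int d = std::abs((int)std::lround(angleDeg[i]) - dir);
            if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)   // window wraps at 0/360
            {
                sumx += respX[i];
                sumy += respY[i];
            }
        }
        const float mod = sumx * sumx + sumy * sumy;        // what s_mod[tid] holds
        if (mod > bestMod)
        {
            bestMod = mod;
            bestx   = sumx;
            besty   = sumy;
        }
    }
    float dir = std::atan2(besty, bestx) * 180.f / 3.14159265f;
    return dir < 0.f ? dir + 360.f : dir;                   // keypoint angle in degrees
}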
openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1); } diff --git a/modules/ocl/doc/image_filtering.rst b/modules/ocl/doc/image_filtering.rst index 33f1b2796..6fbc19a71 100644 --- a/modules/ocl/doc/image_filtering.rst +++ b/modules/ocl/doc/image_filtering.rst @@ -287,7 +287,7 @@ ocl::createSeparableLinearFilter_GPU ---------------------------------------- Creates a separable linear filter engine. -.. ocv:function:: Ptr ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT) +.. ocv:function:: Ptr ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ) :param srcType: Source array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported. @@ -303,6 +303,8 @@ Creates a separable linear filter engine. :param bordertype: Pixel extrapolation method. + :param imgSize: Source image size to choose optimal method for processing. + .. seealso:: :ocv:func:`ocl::getLinearRowFilter_GPU`, :ocv:func:`ocl::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter` @@ -334,7 +336,7 @@ ocl::createDerivFilter_GPU ------------------------------ Creates a filter engine for the generalized Sobel operator. -.. ocv:function:: Ptr ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT ) +.. ocv:function:: Ptr ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ) :param srcType: Source image type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported. @@ -348,6 +350,8 @@ Creates a filter engine for the generalized Sobel operator. :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`. + :param imgSize: Source image size to choose optimal method for processing. + .. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter` @@ -405,7 +409,7 @@ ocl::createGaussianFilter_GPU --------------------------------- Creates a Gaussian filter engine. -.. ocv:function:: Ptr ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT) +.. ocv:function:: Ptr ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ) :param type: Source and destination image type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported. @@ -417,6 +421,8 @@ Creates a Gaussian filter engine. :param bordertype: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`. + :param imgSize: Source image size to choose optimal method for processing. + .. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter` ocl::GaussianBlur diff --git a/modules/ocl/include/opencv2/ocl.hpp b/modules/ocl/include/opencv2/ocl.hpp index 542dbeb0b..bc838924a 100644 --- a/modules/ocl/include/opencv2/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl.hpp @@ -695,17 +695,17 @@ namespace cv //! 
returns the separable linear filter engine CV_EXPORTS Ptr createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, - const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT); + const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1)); //! returns the separable filter engine with the specified filters CV_EXPORTS Ptr createSeparableFilter_GPU(const Ptr &rowFilter, const Ptr &columnFilter); //! returns the Gaussian filter engine - CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT); + CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1)); //! returns filter engine for the generalized Sobel operator - CV_EXPORTS Ptr createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT ); + CV_EXPORTS Ptr createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ); //! applies Laplacian operator to the image // supports only ksize = 1 and ksize = 3 @@ -1439,8 +1439,10 @@ namespace cv oclMat Dx_; oclMat Dy_; oclMat eig_; + oclMat eig_minmax_; oclMat minMaxbuf_; oclMat tmpCorners_; + oclMat counter_; }; inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_, diff --git a/modules/ocl/src/color.cpp b/modules/ocl/src/color.cpp index f71081d78..408ba4cce 100644 --- a/modules/ocl/src/color.cpp +++ b/modules/ocl/src/color.cpp @@ -56,8 +56,19 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: { int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + int pixels_per_work_item = 1; - String build_options = format("-D DEPTH_%d", src.depth()); + if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + if ((src.cols % 4 == 0) && (src.depth() == CV_8U)) + pixels_per_work_item = 4; + else if (src.cols % 2 == 0) + pixels_per_work_item = 2; + else + pixels_per_work_item = 1; + } + + String build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), src.oclchannels(), bidx, pixels_per_work_item); if (!additionalOptions.empty()) build_options = build_options + additionalOptions; @@ -66,7 +77,6 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -77,6 +87,73 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: if (!data2.empty()) args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data2.data )); + size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + 
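To make the effect of pixels_per_work_item concrete, the following host-side sketch works out the launch for an assumed 1920x1080 8-bit source on a device reporting FEATURE_CL_INTEL_DEVICE; the image size is an assumption for illustration, and the selection logic mirrors the code added above.

#include <cstddef>

// Illustration only: assumed 1920x1080 8-bit source on an Intel OpenCL device.
const int  cols = 1920, rows = 1080;
const bool depthIs8U = true;           // stands in for src.depth() == CV_8U

int pixels_per_work_item = 1;
if (cols % 4 == 0 && depthIs8U)
    pixels_per_work_item = 4;          // kernel is built with -D pixels_per_work_item=4
else if (cols % 2 == 0)
    pixels_per_work_item = 2;

// Each work-item handles a short horizontal run of pixels_per_work_item pixels,
// so the global width shrinks from 1920 to 480 while the 16x16 local size is unchanged.
size_t gt[3] = { (size_t)(cols / pixels_per_work_item), (size_t)rows, 1 };
size_t lt[3] = { 16, 16, 1 };
(void)gt; (void)lt;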
openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void toHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), + const oclMat & data1 = oclMat(), const oclMat & data2 = oclMat()) +{ + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + + std::string build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d", src.depth(), src.oclchannels(), bidx); + if (!additionalOptions.empty()) + build_options += additionalOptions; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset )); + + if (!data1.empty()) + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data1.data )); + if (!data2.empty()) + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data2.data )); + + size_t gt[3] = { dst.cols, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void fromGray_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) +{ + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx); + if (!additionalOptions.empty()) + build_options += additionalOptions; + + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset )); + + if (!data.empty()) + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data )); + size_t gt[3] = { dst.cols, dst.rows, 1 }; #ifdef ANDROID size_t lt[3] = { 16, 10, 1 }; @@ -89,7 +166,50 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) { - String build_options = format("-D DEPTH_%d -D dcn=%d", src.depth(), dst.channels()); + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), 
dst_step = dst.step1(); + int pixels_per_work_item = 1; + + if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + if ((src.cols % 4 == 0) && (src.depth() == CV_8U)) + pixels_per_work_item = 4; + else if (src.cols % 2 == 0) + pixels_per_work_item = 2; + else + pixels_per_work_item = 1; + } + + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), dst.channels(), bidx, pixels_per_work_item); + if (!additionalOptions.empty()) + build_options += additionalOptions; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset )); + + if (!data.empty()) + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data )); + + size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void toRGB_NV12_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) +{ + String build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx); if (!additionalOptions.empty()) build_options = build_options + additionalOptions; @@ -101,7 +221,6 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -119,10 +238,13 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); } -static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse) +static void fromHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) { - String build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", src.depth(), - dst.channels(), src.channels(), reverse ? 
"REVERSE" : "ORDER"); + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx); + if (!additionalOptions.empty()) + build_options += additionalOptions; + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); @@ -136,6 +258,36 @@ static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse) args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset )); + if (!data.empty()) + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data )); + + size_t gt[3] = { dst.cols, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse) +{ + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + + String build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", + src.depth(), dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER"); + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset )); + size_t gt[3] = { dst.cols, dst.rows, 1 }; #ifdef ANDROID size_t lt[3] = { 16, 10, 1 }; @@ -147,8 +299,8 @@ static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse) static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName) { - String build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d", - src.depth(), greenbits, dst.channels()); + String build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d -D bidx=%d", + src.depth(), greenbits, dst.channels(), bidx); int src_offset = src.offset >> 1, src_step = src.step >> 1; int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step / dst.elemSize1(); @@ -157,7 +309,6 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -174,8 +325,8 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName) { - String build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d", - src.depth(), greenbits, src.channels()); + String 
build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d -D bidx=%d", + src.depth(), greenbits, src.channels(), bidx); int src_offset = (int)src.offset, src_step = (int)src.step; int dst_offset = dst.offset >> 1, dst_step = dst.step >> 1; @@ -184,7 +335,6 @@ static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenb args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -272,7 +422,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) CV_Assert(scn == 1); dcn = code == COLOR_GRAY2BGRA ? 4 : 3; dst.create(sz, CV_MAKETYPE(depth, dcn)); - toRGB_caller(src, dst, 0, "Gray2RGB"); + fromGray_caller(src, dst, 0, "Gray2RGB"); break; } case COLOR_BGR2YUV: case COLOR_RGB2YUV: @@ -303,7 +453,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) Size dstSz(sz.width, sz.height * 2 / 3); dst.create(dstSz, CV_MAKETYPE(depth, dcn)); - toRGB_caller(src, dst, bidx, "YUV2RGBA_NV12"); + toRGB_NV12_caller(src, dst, bidx, "YUV2RGBA_NV12"); break; } case COLOR_BGR2YCrCb: case COLOR_RGB2YCrCb: @@ -460,11 +610,11 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) initialized = true; } - fromRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180); + toHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180); return; } - fromRGB_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f))); + toHSV_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f))); break; } case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL: @@ -483,7 +633,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) dst.create(sz, CV_MAKETYPE(depth, dcn)); std::string kernelName = std::string(is_hsv ? 
"HSV" : "HLS") + "2RGB"; - toRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange)); + fromHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange)); break; } case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA: diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index 8832b305d..b6e1fff4e 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -741,6 +741,135 @@ void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &ke f->apply(src, dst); } +const int optimizedSepFilterLocalSize = 16; +static void sepFilter2D_SinglePass(const oclMat &src, oclMat &dst, + const Mat &row_kernel, const Mat &col_kernel, int bordertype = BORDER_DEFAULT) +{ + size_t lt2[3] = {optimizedSepFilterLocalSize, optimizedSepFilterLocalSize, 1}; + size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1}; + + unsigned int src_pitch = src.step; + unsigned int dst_pitch = dst.step; + + int src_offset_x = (src.offset % src.step) / src.elemSize(); + int src_offset_y = src.offset / src.step; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_x )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_y )); + + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.offset )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholecols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholerows )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows )); + + String option = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d",(int)lt2[0], (int)lt2[1], + row_kernel.rows / 2, col_kernel.rows / 2 ); + + option += " -D KERNEL_MATRIX_X="; + for(int i=0; i( &row_kernel.at(i) ) ); + option += "0x0"; + + option += " -D KERNEL_MATRIX_Y="; + for(int i=0; i( &col_kernel.at(i) ) ); + option += "0x0"; + + switch(src.type()) + { + case CV_8UC1: + option += " -D SRCTYPE=uchar -D CONVERT_SRCTYPE=convert_float -D WORKTYPE=float"; + break; + case CV_32FC1: + option += " -D SRCTYPE=float -D CONVERT_SRCTYPE= -D WORKTYPE=float"; + break; + case CV_8UC2: + option += " -D SRCTYPE=uchar2 -D CONVERT_SRCTYPE=convert_float2 -D WORKTYPE=float2"; + break; + case CV_32FC2: + option += " -D SRCTYPE=float2 -D CONVERT_SRCTYPE= -D WORKTYPE=float2"; + break; + case CV_8UC3: + option += " -D SRCTYPE=uchar3 -D CONVERT_SRCTYPE=convert_float3 -D WORKTYPE=float3"; + break; + case CV_32FC3: + option += " -D SRCTYPE=float3 -D CONVERT_SRCTYPE= -D WORKTYPE=float3"; + break; + case CV_8UC4: + option += " -D SRCTYPE=uchar4 -D CONVERT_SRCTYPE=convert_float4 -D WORKTYPE=float4"; + break; + case CV_32FC4: + option += " -D SRCTYPE=float4 -D CONVERT_SRCTYPE= -D WORKTYPE=float4"; + break; + default: + CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!"); + break; + } + switch(dst.type()) + { + case CV_8UC1: + option += " -D DSTTYPE=uchar -D CONVERT_DSTTYPE=convert_uchar_sat"; + break; + case CV_8UC2: + option += " -D DSTTYPE=uchar2 -D CONVERT_DSTTYPE=convert_uchar2_sat"; + break; + case CV_8UC3: 
+ option += " -D DSTTYPE=uchar3 -D CONVERT_DSTTYPE=convert_uchar3_sat"; + break; + case CV_8UC4: + option += " -D DSTTYPE=uchar4 -D CONVERT_DSTTYPE=convert_uchar4_sat"; + break; + case CV_32FC1: + option += " -D DSTTYPE=float -D CONVERT_DSTTYPE="; + break; + case CV_32FC2: + option += " -D DSTTYPE=float2 -D CONVERT_DSTTYPE="; + break; + case CV_32FC3: + option += " -D DSTTYPE=float3 -D CONVERT_DSTTYPE="; + break; + case CV_32FC4: + option += " -D DSTTYPE=float4 -D CONVERT_DSTTYPE="; + break; + default: + CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!"); + break; + } + switch(bordertype) + { + case cv::BORDER_CONSTANT: + option += " -D BORDER_CONSTANT"; + break; + case cv::BORDER_REPLICATE: + option += " -D BORDER_REPLICATE"; + break; + case cv::BORDER_REFLECT: + option += " -D BORDER_REFLECT"; + break; + case cv::BORDER_REFLECT101: + option += " -D BORDER_REFLECT_101"; + break; + case cv::BORDER_WRAP: + option += " -D BORDER_WRAP"; + break; + default: + CV_Error(CV_StsBadFlag, "BORDER type is not supported!"); + break; + } + + openCLExecuteKernel(src.clCxt, &filtering_sep_filter_singlepass, "sep_filter_singlepass", gt2, lt2, args, + -1, -1, option.c_str() ); +} + //////////////////////////////////////////////////////////////////////////////////////////////////// // SeparableFilter @@ -790,6 +919,35 @@ Ptr cv::ocl::createSeparableFilter_GPU(const Ptr(rowFilter, columnFilter); } +namespace +{ +class SingleStepSeparableFilterEngine_GPU : public FilterEngine_GPU +{ +public: + SingleStepSeparableFilterEngine_GPU( const Mat &rowKernel_, const Mat &columnKernel_, const int btype ) + { + bordertype = btype; + rowKernel = rowKernel_; + columnKernel = columnKernel_; + } + + virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) + { + normalizeROI(roi, Size(rowKernel.rows, columnKernel.rows), Point(-1,-1), src.size()); + + oclMat srcROI = src(roi); + oclMat dstROI = dst(roi); + + sepFilter2D_SinglePass(src, dst, rowKernel, columnKernel, bordertype); + } + + Mat rowKernel; + Mat columnKernel; + int bordertype; +}; +} + + static void GPUFilterBox(const oclMat &src, oclMat &dst, Size &ksize, const Point anchor, const int borderType) { @@ -1243,17 +1401,32 @@ Ptr cv::ocl::getLinearColumnFilter_GPU(int /*bufType*/, in } Ptr cv::ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, - const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype) + const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype, Size imgSize ) { int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType); int cn = CV_MAT_CN(srcType); int bdepth = std::max(std::max(sdepth, ddepth), CV_32F); int bufType = CV_MAKETYPE(bdepth, cn); + Context* clCxt = Context::getContext(); - Ptr rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype); - Ptr columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta); + //if image size is non-degenerate and large enough + //and if filter support is reasonable to satisfy larger local memory requirements, + //then we can use single pass routine to avoid extra runtime calls overhead + if( clCxt && clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && + rowKernel.rows <= 21 && columnKernel.rows <= 21 && + (rowKernel.rows & 1) == 1 && (columnKernel.rows & 1) == 1 && + imgSize.width > optimizedSepFilterLocalSize + (rowKernel.rows>>1) && + imgSize.height > optimizedSepFilterLocalSize + (columnKernel.rows>>1) 
) + { + return Ptr(new SingleStepSeparableFilterEngine_GPU(rowKernel, columnKernel, bordertype)); + } + else + { + Ptr rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype); + Ptr columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta); - return createSeparableFilter_GPU(rowFilter, columnFilter); + return createSeparableFilter_GPU(rowFilter, columnFilter); + } } void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor, double delta, int bordertype) @@ -1277,16 +1450,16 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); - Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype); + Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype, src.size()); f->apply(src, dst); } -Ptr cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType) +Ptr cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType, Size imgSize ) { Mat kx, ky; getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); return createSeparableLinearFilter_GPU(srcType, dstType, - kx, ky, Point(-1, -1), 0, borderType); + kx, ky, Point(-1, -1), 0, borderType, imgSize); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1356,7 +1529,7 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d //////////////////////////////////////////////////////////////////////////////////////////////////// // Gaussian Filter -Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype) +Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype, Size imgSize) { int depth = CV_MAT_DEPTH(type); @@ -1383,7 +1556,7 @@ Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do else ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F)); - return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype); + return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype, imgSize); } void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2, int bordertype) @@ -1419,7 +1592,7 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si dst.create(src.size(), src.type()); - Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype); + Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype, src.size()); f->apply(src, dst); } diff --git a/modules/ocl/src/gftt.cpp b/modules/ocl/src/gftt.cpp index b07286553..09cd2a192 100644 --- a/modules/ocl/src/gftt.cpp +++ b/modules/ocl/src/gftt.cpp @@ -48,154 +48,142 @@ using namespace cv; using namespace cv::ocl; +// currently sort procedure on the host is more efficient static bool use_cpu_sorter = true; -namespace +// compact structure for corners +struct DefCorner { -enum SortMethod + float eig; //eigenvalue of corner + short x; //x coordinate of corner point + short y; //y coordinate of corner point +} ; + +// compare procedure for corner +//it is used for sort on the host side +struct DefCornerCompare { - CPU_STL, - BITONIC, - SELECTION -}; - -const int 
GROUP_SIZE = 256; - -template -struct Sorter -{ - //typedef EigType; -}; - -//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed. -template<> -struct Sorter -{ - typedef oclMat EigType; - static cv::Mutex cs; - static Mat mat_eig; - - //prototype - static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2) + bool operator()(const DefCorner a, const DefCorner b) const { - float v1 = mat_eig.at(cvRound(pt1.s[1]), cvRound(pt1.s[0])); - float v2 = mat_eig.at(cvRound(pt2.s[1]), cvRound(pt2.s[0])); - return v1 > v2; - } - static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count) - { - cv::AutoLock lock(cs); - //temporarily use STL's sort function - Mat mat_corners = corners; - mat_eig = eig_tex; - std::sort(mat_corners.begin(), mat_corners.begin() + count, clfloat2Gt); - corners = mat_corners; + return a.eig > b.eig; } }; -cv::Mutex Sorter::cs; -cv::Mat Sorter::mat_eig; -template<> -struct Sorter +// sort corner point using opencl bitonicosrt implementation +static void sortCorners_caller(oclMat& corners, const int count) { - typedef TextureCL EigType; + Context * cxt = Context::getContext(); + int GS = count/2; + int LS = min(255,GS); + size_t globalThreads[3] = {GS, 1, 1}; + size_t localThreads[3] = {LS, 1, 1}; - static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count) + // 2^numStages should be equal to count or the output is invalid + int numStages = 0; + for(int i = count; i > 1; i >>= 1) { - Context * cxt = Context::getContext(); - size_t globalThreads[3] = {count / 2, 1, 1}; - size_t localThreads[3] = {GROUP_SIZE, 1, 1}; - - // 2^numStages should be equal to count or the output is invalid - int numStages = 0; - for(int i = count; i > 1; i >>= 1) + ++numStages; + } + const int argc = 4; + std::vector< std::pair > args(argc); + std::string kernelname = "sortCorners_bitonicSort"; + args[0] = std::make_pair(sizeof(cl_mem), (void *)&corners.data); + args[1] = std::make_pair(sizeof(cl_int), (void *)&count); + for(int stage = 0; stage < numStages; ++stage) + { + args[2] = std::make_pair(sizeof(cl_int), (void *)&stage); + for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) { - ++numStages; - } - const int argc = 5; - std::vector< std::pair > args(argc); - String kernelname = "sortCorners_bitonicSort"; - args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex); - args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data); - args[2] = std::make_pair(sizeof(cl_int), (void *)&count); - for(int stage = 0; stage < numStages; ++stage) - { - args[3] = std::make_pair(sizeof(cl_int), (void *)&stage); - for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) - { - args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage); - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); - } + args[3] = std::make_pair(sizeof(cl_int), (void *)&passOfStage); + openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); } } -}; +} -template<> -struct Sorter -{ - typedef TextureCL EigType; - - static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count) - { - Context * cxt = Context::getContext(); - - size_t globalThreads[3] = {count, 1, 1}; - size_t localThreads[3] = {GROUP_SIZE, 1, 1}; - - std::vector< std::pair > args; - //local - String kernelname = "sortCorners_selectionSortLocal"; - int lds_size = GROUP_SIZE * sizeof(cl_float2); - args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) ); - 
args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) ); - args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) ); - args.push_back( std::make_pair( lds_size, (void*)NULL) ); - - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); - - //final - kernelname = "sortCorners_selectionSortFinal"; - args.pop_back(); - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); - } -}; - -int findCorners_caller( - const TextureCL& eig, - const float threshold, - const oclMat& mask, - oclMat& corners, - const int max_count) +// find corners on matrix and put it into array +static void findCorners_caller( + const oclMat& eig_mat, //input matrix worth eigenvalues + oclMat& eigMinMax, //input with min and max values of eigenvalues + const float qualityLevel, + const oclMat& mask, + oclMat& corners, //output array with detected corners + oclMat& counter) //output value with number of detected corners, have to be 0 before call { + String opt; std::vector k; Context * cxt = Context::getContext(); std::vector< std::pair > args; - String kernelname = "findCorners"; const int mask_strip = mask.step / mask.elemSize1(); - oclMat g_counter(1, 1, CV_32SC1); - g_counter.setTo(0); + args.push_back(std::make_pair( sizeof(cl_mem), (void*)&(eig_mat.data))); - args.push_back(std::make_pair( sizeof(cl_mem), (void*)&eig )); + int src_pitch = (int)eig_mat.step; + args.push_back(std::make_pair( sizeof(cl_int), (void*)&src_pitch )); args.push_back(std::make_pair( sizeof(cl_mem), (void*)&mask.data )); args.push_back(std::make_pair( sizeof(cl_mem), (void*)&corners.data )); args.push_back(std::make_pair( sizeof(cl_int), (void*)&mask_strip)); - args.push_back(std::make_pair( sizeof(cl_float), (void*)&threshold )); - args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.rows )); - args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.cols )); - args.push_back(std::make_pair( sizeof(cl_int), (void*)&max_count )); - args.push_back(std::make_pair( sizeof(cl_mem), (void*)&g_counter.data )); + args.push_back(std::make_pair( sizeof(cl_mem), (void*)&eigMinMax.data )); + args.push_back(std::make_pair( sizeof(cl_float), (void*)&qualityLevel )); + args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig_mat.rows )); + args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig_mat.cols )); + args.push_back(std::make_pair( sizeof(cl_int), (void*)&corners.cols )); + args.push_back(std::make_pair( sizeof(cl_mem), (void*)&counter.data )); - size_t globalThreads[3] = {eig.cols, eig.rows, 1}; + size_t globalThreads[3] = {eig_mat.cols, eig_mat.rows, 1}; size_t localThreads[3] = {16, 16, 1}; + if(!mask.empty()) + opt += " -D WITH_MASK=1"; - const char * opt = mask.empty() ? 
"" : "-D WITH_MASK"; - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1, opt); - return std::min(Mat(g_counter).at(0), max_count); + openCLExecuteKernel(cxt, &imgproc_gftt, "findCorners", globalThreads, localThreads, args, -1, -1, opt.c_str()); +} + + +static void minMaxEig_caller(const oclMat &src, oclMat &dst, oclMat & tozero) +{ + size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits; + CV_Assert(groupnum != 0); + + int dbsize = groupnum * 2 * src.elemSize(); + + ensureSizeIsEnough(1, dbsize, CV_8UC1, dst); + + cl_mem dst_data = reinterpret_cast(dst.data); + + int all_cols = src.step / src.elemSize(); + int pre_cols = (src.offset % src.step) / src.elemSize(); + int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1; + int invalid_cols = pre_cols + sec_cols; + int cols = all_cols - invalid_cols , elemnum = cols * src.rows; + int offset = src.offset / src.elemSize(); + + {// first parallel pass + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum)); + size_t globalThreads[3] = {groupnum * 256, 1, 1}; + size_t localThreads[3] = {256, 1, 1}; + openCLExecuteKernel(src.clCxt, &arithm_minMax, "arithm_op_minMax", globalThreads, localThreads, + args, -1, -1, "-D T=float -D DEPTH_5"); + } + + {// run final "serial" kernel to find accumulate results from threads and reset corner counter + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&tozero.data )); + size_t globalThreads[3] = {1, 1, 1}; + size_t localThreads[3] = {1, 1, 1}; + openCLExecuteKernel(src.clCxt, &imgproc_gftt, "arithm_op_minMax_final", globalThreads, localThreads, + args, -1, -1); + } } -}//unnamed namespace void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask) { @@ -205,67 +193,99 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, ensureSizeIsEnough(image.size(), CV_32F, eig_); if (useHarrisDetector) - cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK); + cornerHarris_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK); else cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3); - double maxVal = 0; - minMax(eig_, NULL, &maxVal); + ensureSizeIsEnough(1,1, CV_32SC1, counter_); - ensureSizeIsEnough(1, std::max(1000, static_cast(image.size().area() * 0.05)), CV_32FC2, tmpCorners_); + // find max eigenvalue and reset detected counters + minMaxEig_caller(eig_,eig_minmax_,counter_); - Ptr eig_tex = bindTexturePtr(eig_); - int total = findCorners_caller( - *eig_tex, - static_cast(maxVal * qualityLevel), + // allocate buffer for kernels + int corner_array_size = std::max(1024, static_cast(image.size().area() * 0.05)); + + if(!use_cpu_sorter) + { // round to 2^n + unsigned int n=1; + for(n=1;n<(unsigned int)corner_array_size;n<<=1); + corner_array_size = (int)n; + + ensureSizeIsEnough(1, 
corner_array_size , CV_32FC2, tmpCorners_); + + // set to 0 to be able use bitonic sort on whole 2^n array + tmpCorners_.setTo(0); + } + else + { + ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_); + } + + int total = tmpCorners_.cols; // by default the number of corner is full array + std::vector tmp(tmpCorners_.cols); // input buffer with corner for HOST part of algorithm + + //find points with high eigenvalue and put it into the output array + findCorners_caller( + eig_, + eig_minmax_, + static_cast(qualityLevel), mask, tmpCorners_, - tmpCorners_.cols); + counter_); + + if(!use_cpu_sorter) + {// sort detected corners on deivce side + sortCorners_caller(tmpCorners_, corner_array_size); + } + else + {// send non-blocking request to read real non-zero number of corners to sort it on the HOST side + openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(counter_.clCxt), (cl_mem)counter_.data, CL_FALSE, 0,sizeof(int), &total, 0, NULL, NULL)); + } + + //blocking read whole corners array (sorted or not sorted) + openCLReadBuffer(tmpCorners_.clCxt,(cl_mem)tmpCorners_.data,&tmp[0],tmpCorners_.cols*sizeof(DefCorner)); if (total == 0) - { + {// check for trivial case corners.release(); return; } + if(use_cpu_sorter) - { - Sorter::sortCorners_caller(eig_, tmpCorners_, total); - } - else - { - //if total is power of 2 - if(((total - 1) & (total)) == 0) - { - Sorter::sortCorners_caller(*eig_tex, tmpCorners_, total); - } - else - { - Sorter::sortCorners_caller(*eig_tex, tmpCorners_, total); - } + {// sort detected corners on cpu side. + tmp.resize(total); + std::sort(tmp.begin(), tmp.end(), DefCornerCompare()); } + //estimate maximal size of final output array + int total_max = maxCorners > 0 ? std::min(maxCorners, total) : total; + int D2 = (int)ceil(minDistance * minDistance); + // allocate output buffer + std::vector tmp2; + tmp2.reserve(total_max); + + if (minDistance < 1) - { - Rect roi_range(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1); - tmpCorners_(roi_range).copyTo(corners); + {// we have not distance restriction. then just copy with conversion maximal allowed points into output array + for(int i=0;i0.0f;++i) + { + tmp2.push_back(Point2f(tmp[i].x,tmp[i].y)); + } } else - { - std::vector tmp(total); - downloadPoints(tmpCorners_, tmp); - - std::vector tmp2; - tmp2.reserve(total); - + {// we have distance restriction. then start coping to output array from the first element and check distance for each next one const int cell_size = cvRound(minDistance); const int grid_width = (image.cols + cell_size - 1) / cell_size; const int grid_height = (image.rows + cell_size - 1) / cell_size; - std::vector< std::vector > grid(grid_width * grid_height); + std::vector< std::vector > grid(grid_width * grid_height); - for (int i = 0; i < total; ++i) + for (int i = 0; i < total ; ++i) { - Point2f p = tmp[i]; + DefCorner p = tmp[i]; + + if(p.eig<=0.0f) + break; // condition to stop that is needed for GPU bitonic sort usage. 
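+            // Scan the 3x3 block of grid cells around the candidate; cells are
+            // cvRound(minDistance) wide, and the squared pixel distance to every
+            // corner already stored there is compared against
+            // D2 = (int)ceil(minDistance * minDistance) computed above.  The goto
+            // below exits both cell loops as soon as a close neighbour is found.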
bool good = true; @@ -287,40 +307,42 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, { for (int xx = x1; xx <= x2; xx++) { - std::vector& m = grid[yy * grid_width + xx]; - - if (!m.empty()) + std::vector& m = grid[yy * grid_width + xx]; + if (m.empty()) + continue; + for(size_t j = 0; j < m.size(); j++) { - for(size_t j = 0; j < m.size(); j++) - { - float dx = p.x - m[j].x; - float dy = p.y - m[j].y; + int dx = p.x - m[j].x; + int dy = p.y - m[j].y; - if (dx * dx + dy * dy < minDistance * minDistance) - { - good = false; - goto break_out; - } + if (dx * dx + dy * dy < D2) + { + good = false; + goto break_out_; } } } } - break_out: + break_out_: if(good) { - grid[y_cell * grid_width + x_cell].push_back(p); + grid[y_cell * grid_width + x_cell].push_back(Point2i(p.x,p.y)); - tmp2.push_back(p); + tmp2.push_back(Point2f(p.x,p.y)); if (maxCorners > 0 && tmp2.size() == static_cast(maxCorners)) break; } } - corners.upload(Mat(1, static_cast(tmp2.size()), CV_32FC2, &tmp2[0])); } + int final_size = static_cast(tmp2.size()); + if(final_size>0) + corners.upload(Mat(1, final_size, CV_32FC2, &tmp2[0])); + else + corners.release(); } void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, std::vector &points_v) { diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index a023f8a04..d38b3bad9 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -866,16 +866,17 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vectoris_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE)) { - //setup local group size - localThreads[0] = 8; - localThreads[1] = 16; + //setup local group size for "pixel step" = 1 + localThreads[0] = 16; + localThreads[1] = 32; localThreads[2] = 1; - //init maximal number of workgroups + //calc maximal number of workgroups int WGNumX = 1+(sizev[0].width /(localThreads[0])); int WGNumY = 1+(sizev[0].height/(localThreads[1])); int WGNumZ = loopcount; - int WGNum = 0; //accurate number of non -empty workgroups + int WGNumTotal = 0; //accurate number of non-empty workgroups + int WGNumSampled = 0; //accurate number of workgroups processed only 1/4 part of all pixels. 
it is made for large images with scale <= 2 oclMat oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U); { cl_int4* pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status); @@ -895,12 +896,16 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector=(Width-cascade->orig_window_size.width)) continue; // no data to process + if(scaleinfo[z].factor<=2) + { + WGNumSampled++; + } // save no-empty workgroup info into array - pWGInfo[WGNum].s[0] = scaleinfo[z].width_height; - pWGInfo[WGNum].s[1] = (gx << 16) | gy; - pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff; - memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float)); - WGNum++; + pWGInfo[WGNumTotal].s[0] = scaleinfo[z].width_height; + pWGInfo[WGNumTotal].s[1] = (gx << 16) | gy; + pWGInfo[WGNumTotal].s[2] = scaleinfo[z].imgoff; + memcpy(&(pWGInfo[WGNumTotal].s[3]),&(scaleinfo[z].factor),sizeof(float)); + WGNumTotal++; } } } @@ -908,13 +913,8 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vectororig_window_size.width); options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height); options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based); - options += format(" -D LSx=%d",localThreads[0]); - options += format(" -D LSy=%d",localThreads[1]); options += format(" -D SPLITNODE=%d",splitnode); options += format(" -D SPLITSTAGE=%d",splitstage); options += format(" -D OUTPUTSZ=%d",outputsz); @@ -972,8 +970,39 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vectorWGNumSampled) + {// small images and each pixel is processed + // setup global sizes to have linear array of workgroups with WGNum size + int pixelstep = 1; + size_t LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1}; + globalThreads[0] = LS[0]*(WGNumTotal-WGNumSampled); + globalThreads[1] = LS[1]; + globalThreads[2] = 1; + String options1 = options; + options1 += format(" -D PIXEL_STEP=%d",pixelstep); + options1 += format(" -D WGSTART=%d",WGNumSampled); + options1 += format(" -D LSx=%d",LS[0]); + options1 += format(" -D LSy=%d",LS[1]); + // execute face detector + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options1.c_str()); + } + if(WGNumSampled>0) + {// large images each 4th pixel is processed + // setup global sizes to have linear array of workgroups with WGNum size + int pixelstep = 2; + size_t LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1}; + globalThreads[0] = LS[0]*WGNumSampled; + globalThreads[1] = LS[1]; + globalThreads[2] = 1; + String options2 = options; + options2 += format(" -D PIXEL_STEP=%d",pixelstep); + options2 += format(" -D WGSTART=%d",0); + options2 += format(" -D LSx=%d",LS[0]); + options2 += format(" -D LSy=%d",LS[1]); + // execute face detector + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options2.c_str()); + } //read candidate buffer back and put it into host list openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); assert(candidate[0]supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + qangle_type = CV_32SC2; + qangle_step_shift = 2; + } } void cv::ocl::device::hog::compute_hists(int nbins, @@ -1627,7 +1643,7 @@ void cv::ocl::device::hog::compute_hists(int nbins, int blocks_total = img_block_width * img_block_height; int grad_quadstep = grad.step >> 2; - int 
qangle_step = qangle.step; + int qangle_step = qangle.step >> qangle_step_shift; int blocks_in_group = 4; size_t localThreads[3] = { blocks_in_group * 24, 2, 1 }; @@ -1892,7 +1908,7 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, char correctGamma = (correct_gamma) ? 1 : 0; int img_step = img.step; int grad_quadstep = grad.step >> 3; - int qangle_step = qangle.step >> 1; + int qangle_step = qangle.step >> (1 + qangle_step_shift); args.push_back( std::make_pair( sizeof(cl_int), (void *)&height)); args.push_back( std::make_pair( sizeof(cl_int), (void *)&width)); @@ -1927,7 +1943,7 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, char correctGamma = (correct_gamma) ? 1 : 0; int img_step = img.step >> 2; int grad_quadstep = grad.step >> 3; - int qangle_step = qangle.step >> 1; + int qangle_step = qangle.step >> (1 + qangle_step_shift); args.push_back( std::make_pair( sizeof(cl_int), (void *)&height)); args.push_back( std::make_pair( sizeof(cl_int), (void *)&width)); diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index f730df10f..0ac627172 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -1035,67 +1035,117 @@ namespace cv else scale = 1. / scale; - if (ksize > 0) + const int sobel_lsz = 16; + if((src.type() == CV_8UC1 || src.type() == CV_32FC1) && + (ksize==3 || ksize==5 || ksize==7 || ksize==-1) && + src.wholerows > sobel_lsz + (ksize>>1) && + src.wholecols > sobel_lsz + (ksize>>1)) { - Context* clCxt = Context::getContext(); - if(clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && src.type() == CV_8UC1 && - src.cols % 8 == 0 && src.rows % 8 == 0 && - ksize==3 && - (borderType ==cv::BORDER_REFLECT || - borderType == cv::BORDER_REPLICATE || - borderType ==cv::BORDER_REFLECT101 || - borderType ==cv::BORDER_WRAP)) + Dx.create(src.size(), CV_32FC1); + Dy.create(src.size(), CV_32FC1); + + CV_Assert(Dx.rows == Dy.rows && Dx.cols == Dy.cols); + + size_t lt2[3] = {sobel_lsz, sobel_lsz, 1}; + size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1}; + + unsigned int src_pitch = src.step; + unsigned int Dx_pitch = Dx.step; + unsigned int Dy_pitch = Dy.step; + + int src_offset_x = (src.offset % src.step) / src.elemSize(); + int src_offset_y = src.offset / src.step; + + float _scale = scale; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_x )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_y )); + + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.offset )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&Dx_pitch )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dy.offset )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&Dy_pitch )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholecols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholerows )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.rows )); + + args.push_back( std::make_pair( sizeof(cl_float), (void *)&_scale )); + + String option = cv::format("-D BLK_X=%d -D 
BLK_Y=%d",(int)lt2[0],(int)lt2[1]); + switch(src.type()) { - Dx.create(src.size(), CV_32FC1); - Dy.create(src.size(), CV_32FC1); - - const unsigned int block_x = 8; - const unsigned int block_y = 8; - - unsigned int src_pitch = src.step; - unsigned int dst_pitch = Dx.cols; - - float _scale = scale; - - std::vector > args; - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data )); - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows )); - args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch )); - args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch )); - args.push_back( std::make_pair( sizeof(cl_float) , (void *)&_scale )); - size_t gt2[3] = {src.cols, src.rows, 1}, lt2[3] = {block_x, block_y, 1}; - - String option = "-D BLK_X=8 -D BLK_Y=8"; - switch(borderType) - { - case cv::BORDER_REPLICATE: - option += " -D BORDER_REPLICATE"; - break; - case cv::BORDER_REFLECT: - option += " -D BORDER_REFLECT"; - break; - case cv::BORDER_REFLECT101: - option += " -D BORDER_REFLECT101"; - break; - case cv::BORDER_WRAP: - option += " -D BORDER_WRAP"; - break; - } - openCLExecuteKernel(src.clCxt, &imgproc_sobel3, "sobel3", gt2, lt2, args, -1, -1, option.c_str() ); + case CV_8UC1: + option += " -D SRCTYPE=uchar"; + break; + case CV_32FC1: + option += " -D SRCTYPE=float"; + break; } - else + switch(borderType) + { + case cv::BORDER_CONSTANT: + option += " -D BORDER_CONSTANT"; + break; + case cv::BORDER_REPLICATE: + option += " -D BORDER_REPLICATE"; + break; + case cv::BORDER_REFLECT: + option += " -D BORDER_REFLECT"; + break; + case cv::BORDER_REFLECT101: + option += " -D BORDER_REFLECT_101"; + break; + case cv::BORDER_WRAP: + option += " -D BORDER_WRAP"; + break; + default: + CV_Error(CV_StsBadFlag, "BORDER type is not supported!"); + break; + } + + String kernel_name; + switch(ksize) + { + case -1: + option += " -D SCHARR"; + kernel_name = "sobel3"; + break; + case 3: + kernel_name = "sobel3"; + break; + case 5: + kernel_name = "sobel5"; + break; + case 7: + kernel_name = "sobel7"; + break; + default: + CV_Error(CV_StsBadFlag, "Kernel size is not supported!"); + break; + } + openCLExecuteKernel(src.clCxt, &imgproc_sobel3, kernel_name, gt2, lt2, args, -1, -1, option.c_str() ); + } + else + { + if (ksize > 0) { Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType); Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType); } - } - else - { - Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType); - Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType); + else + { + Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType); + Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType); + } } CV_Assert(Dx.offset == 0 && Dy.offset == 0); } diff --git a/modules/ocl/src/opencl/cvt_color.cl b/modules/ocl/src/opencl/cvt_color.cl index bf3b6cfa7..5c236f0e0 100644 --- a/modules/ocl/src/opencl/cvt_color.cl +++ b/modules/ocl/src/opencl/cvt_color.cl @@ -56,35 +56,59 @@ #ifdef DEPTH_0 #define DATA_TYPE uchar +#define VECTOR2 uchar2 +#define VECTOR4 uchar4 +#define VECTOR8 uchar8 +#define VECTOR16 uchar16 #define COEFF_TYPE int #define MAX_NUM 255 #define HALF_MAX 128 #define SAT_CAST(num) convert_uchar_sat_rte(num) +#define SAT_CAST2(num) convert_uchar2_sat(num) +#define SAT_CAST4(num) convert_uchar4_sat(num) #endif #ifdef DEPTH_2 #define DATA_TYPE ushort 
+#define VECTOR2 ushort2 +#define VECTOR4 ushort4 +#define VECTOR8 ushort8 +#define VECTOR16 ushort16 #define COEFF_TYPE int #define MAX_NUM 65535 #define HALF_MAX 32768 #define SAT_CAST(num) convert_ushort_sat_rte(num) +#define SAT_CAST2(num) convert_ushort2_sat(num) +#define SAT_CAST4(num) convert_ushort4_sat(num) #endif #ifdef DEPTH_5 #define DATA_TYPE float +#define VECTOR2 float2 +#define VECTOR4 float4 +#define VECTOR8 float8 +#define VECTOR16 float16 #define COEFF_TYPE float #define MAX_NUM 1.0f #define HALF_MAX 0.5f #define SAT_CAST(num) (num) #endif +#ifndef bidx + #define bidx 0 +#endif + +#ifndef pixels_per_work_item + #define pixels_per_work_item 1 +#endif + #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n)) enum { yuv_shift = 14, xyz_shift = 12, - hsv_shift = 12, + hsv_shift = 12, R2Y = 4899, G2Y = 9617, B2Y = 1868, @@ -93,26 +117,87 @@ enum ///////////////////////////////////// RGB <-> GRAY ////////////////////////////////////// +__constant float c_RGB2GrayCoeffs_f[3] = { 0.114f, 0.587f, 0.299f }; +__constant int c_RGB2GrayCoeffs_i[3] = { B2Y, G2Y, R2Y }; + __kernel void RGB2Gray(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) { int src_idx = mad24(y, src_step, src_offset + (x << 2)); int dst_idx = mad24(y, dst_step, dst_offset + x); + +#ifndef INTEL_DEVICE + #ifdef DEPTH_5 dst[dst_idx] = src[src_idx + bidx] * 0.114f + src[src_idx + 1] * 0.587f + src[src_idx + (bidx^2)] * 0.299f; #else dst[dst_idx] = (DATA_TYPE)CV_DESCALE((src[src_idx + bidx] * B2Y + src[src_idx + 1] * G2Y + src[src_idx + (bidx^2)] * R2Y), yuv_shift); #endif + +#else //INTEL_DEVICE + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + +#ifdef DEPTH_5 + __constant float * coeffs = c_RGB2GrayCoeffs_f; +#else + __constant int * coeffs = c_RGB2GrayCoeffs_i; +#endif + +#if (1 == pixels_per_work_item) + { +#ifdef DEPTH_5 + *dst_ptr = src_ptr[bidx] * coeffs[0] + src_ptr[1] * coeffs[1] + src_ptr[(bidx^2)] *coeffs[2]; +#else + *dst_ptr = (DATA_TYPE)CV_DESCALE((src_ptr[bidx] * coeffs[0] + src_ptr[1] * coeffs[1] + src_ptr[(bidx^2)] * coeffs[2]), yuv_shift); +#endif + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 c0 = r0.s04; + const float2 c1 = r0.s15; + const float2 c2 = r0.s26; + + const float2 Y = c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2]; +#else + const int2 c0 = convert_int2(r0.s04); + const int2 c1 = convert_int2(r0.s15); + const int2 c2 = convert_int2(r0.s26); + + const int2 yi = CV_DESCALE(c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2], yuv_shift); + const VECTOR2 Y = SAT_CAST2(yi); +#endif + + vstore2(Y, 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 c0 = convert_int4(r0.s048c); + const int4 c1 = convert_int4(r0.s159d); + const int4 c2 = convert_int4(r0.s26ae); + const int4 Y = CV_DESCALE(c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2], yuv_shift); + + vstore4(SAT_CAST4(Y), 0, dst_ptr); +#endif + } +#endif //pixels_per_work_item +#endif //INTEL_DEVICE } } -__kernel void Gray2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void 
Gray2RGB(int cols, int rows, int src_step, int dst_step, __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { @@ -140,10 +225,10 @@ __constant float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877 __constant int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 }; __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -151,24 +236,84 @@ __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step, x <<= 2; int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE rgb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 __constant float * coeffs = c_RGB2YUVCoeffs_f; - DATA_TYPE Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; - DATA_TYPE Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; - DATA_TYPE Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; #else __constant int * coeffs = c_RGB2YUVCoeffs_i; - int delta = HALF_MAX * (1 << yuv_shift); - int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); - int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); - int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); + const int delta = HALF_MAX * (1 << yuv_shift); #endif - dst[dst_idx] = SAT_CAST( Y ); - dst[dst_idx + 1] = SAT_CAST( Cr ); - dst[dst_idx + 2] = SAT_CAST( Cb ); +#if (1 == pixels_per_work_item) + { + const DATA_TYPE rgb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; + float U = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; + float V = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; +#else + int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); + int U = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); + int V = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); +#endif + + dst_ptr[0] = SAT_CAST( Y ); + dst_ptr[1] = SAT_CAST( U ); + dst_ptr[2] = SAT_CAST( V ); + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 c0 = r0.s04; + const float2 c1 = r0.s15; + const float2 c2 = r0.s26; + + const float2 Y = (bidx == 0) ? (c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0]) : (c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2]); + const float2 U = (bidx == 0) ? ((c2 - Y) * coeffs[3] + HALF_MAX) : ((c0 - Y) * coeffs[3] + HALF_MAX); + const float2 V = (bidx == 0) ? ((c0 - Y) * coeffs[4] + HALF_MAX) : ((c2 - Y) * coeffs[4] + HALF_MAX); +#else + const int2 c0 = convert_int2(r0.s04); + const int2 c1 = convert_int2(r0.s15); + const int2 c2 = convert_int2(r0.s26); + + const int2 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int2 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int2 vi = (bidx == 0) ? 
CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR2 Y = SAT_CAST2(yi); + const VECTOR2 U = SAT_CAST2(ui); + const VECTOR2 V = SAT_CAST2(vi); +#endif + + vstore8((VECTOR8)(Y.s0, U.s0, V.s0, 0, Y.s1, U.s1, V.s1, 0), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 c0 = convert_int4(r0.s048c); + const int4 c1 = convert_int4(r0.s159d); + const int4 c2 = convert_int4(r0.s26ae); + + const int4 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int4 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int4 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR4 Y = SAT_CAST4(yi); + const VECTOR4 U = SAT_CAST4(ui); + const VECTOR4 V = SAT_CAST4(vi); + + vstore16((VECTOR16)(Y.s0, U.s0, V.s0, 0, Y.s1, U.s1, V.s1, 0, Y.s2, U.s2, V.s2, 0, Y.s3, U.s3, V.s3, 0), 0, dst_ptr); +#endif + } +#endif //pixels_per_work_item } } @@ -176,10 +321,10 @@ __constant float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f }; __constant int c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 }; __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -187,26 +332,94 @@ __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step, x <<= 2; int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE yuv[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 __constant float * coeffs = c_YUV2RGBCoeffs_f; - float b = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[3]; - float g = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1]; - float r = yuv[0] + (yuv[1] - HALF_MAX) * coeffs[0]; #else __constant int * coeffs = c_YUV2RGBCoeffs_i; - int b = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[3], yuv_shift); - int g = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1], yuv_shift); - int r = yuv[0] + CV_DESCALE((yuv[1] - HALF_MAX) * coeffs[0], yuv_shift); #endif - dst[dst_idx + bidx] = SAT_CAST( b ); - dst[dst_idx + 1] = SAT_CAST( g ); - dst[dst_idx + (bidx^2)] = SAT_CAST( r ); -#if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; +#if (1 == pixels_per_work_item) + { + const DATA_TYPE yuv[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float B = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[3]; + float G = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1]; + float R = yuv[0] + (yuv[1] - HALF_MAX) * coeffs[0]; +#else + int B = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[3], yuv_shift); + int G = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1], yuv_shift); + int R = yuv[0] + CV_DESCALE((yuv[1] - HALF_MAX) * coeffs[0], yuv_shift); #endif + + 
dst_ptr[bidx] = SAT_CAST( B ); + dst_ptr[1] = SAT_CAST( G ); + dst_ptr[(bidx^2)] = SAT_CAST( R ); +#if dcn == 4 + dst_ptr[3] = MAX_NUM; +#endif + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 Y = r0.s04; + const float2 U = r0.s15; + const float2 V = r0.s26; + + const float2 c0 = (bidx == 0) ? (Y + (V - HALF_MAX) * coeffs[3]) : (Y + (U - HALF_MAX) * coeffs[0]); + const float2 c1 = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1]; + const float2 c2 = (bidx == 0) ? (Y + (U - HALF_MAX) * coeffs[0]) : (Y + (V - HALF_MAX) * coeffs[3]); +#else + const int2 Y = convert_int2(r0.s04); + const int2 U = convert_int2(r0.s15); + const int2 V = convert_int2(r0.s26); + + const int2 c0i = (bidx == 0) ? (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)); + const int2 c1i = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift); + const int2 c2i = (bidx == 0) ? (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR2 c0 = SAT_CAST2(c0i); + const VECTOR2 c1 = SAT_CAST2(c1i); + const VECTOR2 c2 = SAT_CAST2(c2i); +#endif + +#if dcn == 4 + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM), 0, dst_ptr); +#else + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0), 0, dst_ptr); +#endif + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 Y = convert_int4(r0.s048c); + const int4 U = convert_int4(r0.s159d); + const int4 V = convert_int4(r0.s26ae); + + const int4 c0i = (bidx == 0) ? (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)); + const int4 c1i = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift); + const int4 c2i = (bidx == 0) ? 
(Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR4 c0 = SAT_CAST4(c0i); + const VECTOR4 c1 = SAT_CAST4(c1i); + const VECTOR4 c2 = SAT_CAST4(c2i); + +#if dcn == 4 + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM, c0.s2, c1.s2, c2.s2, MAX_NUM, c0.s3, c1.s3, c2.s3, MAX_NUM), 0, dst_ptr); +#else + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0, c0.s2, c1.s2, c2.s2, 0, c0.s3, c1.s3, c2.s3, 0), 0, dst_ptr); +#endif +#endif + } +#endif //pixels_per_work_item } } @@ -218,7 +431,7 @@ __constant int ITUR_BT_601_CVR = 1673527; __constant int ITUR_BT_601_SHIFT = 20; __kernel void YUV2RGBA_NV12(int cols, int rows, int src_step, int dst_step, - int bidx, __global const uchar* src, __global uchar* dst, + __global const uchar* src, __global uchar* dst, int src_offset, int dst_offset) { const int x = get_global_id(0); @@ -275,10 +488,10 @@ __constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564 __constant int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241}; __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, - int src_offset, int dst_offset) + __global const DATA_TYPE* src, __global DATA_TYPE* dst, + int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -287,24 +500,82 @@ __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE rgb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 __constant float * coeffs = c_RGB2YCrCbCoeffs_f; - DATA_TYPE Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; - DATA_TYPE Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; - DATA_TYPE Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; #else __constant int * coeffs = c_RGB2YCrCbCoeffs_i; - int delta = HALF_MAX * (1 << yuv_shift); - int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); - int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); - int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); + const int delta = HALF_MAX * (1 << yuv_shift); #endif - dst[dst_idx] = SAT_CAST( Y ); - dst[dst_idx + 1] = SAT_CAST( Cr ); - dst[dst_idx + 2] = SAT_CAST( Cb ); +#if (1 == pixels_per_work_item) + { + const DATA_TYPE rgb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; + float Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; + float Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; +#else + int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); + int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); + int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); +#endif + + dst_ptr[0] = SAT_CAST( Y ); + dst_ptr[1] = SAT_CAST( Cr ); + dst_ptr[2] = SAT_CAST( Cb ); + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 c0 = r0.s04; + const float2 c1 = r0.s15; + const float2 c2 = r0.s26; + + const 
float2 Y = (bidx == 0) ? (c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0]) : (c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2]); + const float2 Cr = (bidx == 0) ? ((c2 - Y) * coeffs[3] + HALF_MAX) : ((c0 - Y) * coeffs[3] + HALF_MAX); + const float2 Cb = (bidx == 0) ? ((c0 - Y) * coeffs[4] + HALF_MAX) : ((c2 - Y) * coeffs[4] + HALF_MAX); +#else + const int2 c0 = convert_int2(r0.s04); + const int2 c1 = convert_int2(r0.s15); + const int2 c2 = convert_int2(r0.s26); + + const int2 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int2 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int2 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR2 Y = SAT_CAST2(yi); + const VECTOR2 Cr = SAT_CAST2(ui); + const VECTOR2 Cb = SAT_CAST2(vi); +#endif + + vstore8((VECTOR8)(Y.s0, Cr.s0, Cb.s0, 0, Y.s1, Cr.s1, Cb.s1, 0), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + const int4 c0 = convert_int4(r0.s048c); + const int4 c1 = convert_int4(r0.s159d); + const int4 c2 = convert_int4(r0.s26ae); + + const int4 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int4 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int4 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR4 Y = SAT_CAST4(yi); + const VECTOR4 Cr = SAT_CAST4(ui); + const VECTOR4 Cb = SAT_CAST4(vi); + + vstore16((VECTOR16)(Y.s0, Cr.s0, Cb.s0, 0, Y.s1, Cr.s1, Cb.s1, 0, Y.s2, Cr.s2, Cb.s2, 0, Y.s3, Cr.s3, Cb.s3, 0), 0, dst_ptr); +#endif + } +#endif //pixels_per_work_item } } @@ -312,10 +583,10 @@ __constant float c_YCrCb2RGBCoeffs_f[4] = { 1.403f, -0.714f, -0.344f, 1.773f }; __constant int c_YCrCb2RGBCoeffs_i[4] = { 22987, -11698, -5636, 29049 }; __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, - int src_offset, int dst_offset) + __global const DATA_TYPE* src, __global DATA_TYPE* dst, + int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -324,36 +595,103 @@ __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE ycrcb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 - __constant float * coeff = c_YCrCb2RGBCoeffs_f; - float r = ycrcb[0] + coeff[0] * (ycrcb[1] - HALF_MAX); - float g = ycrcb[0] + coeff[1] * (ycrcb[1] - HALF_MAX) + coeff[2] * (ycrcb[2] - HALF_MAX); - float b = ycrcb[0] + coeff[3] * (ycrcb[2] - HALF_MAX); + __constant float * coeffs = c_YCrCb2RGBCoeffs_f; #else - __constant int * coeff = c_YCrCb2RGBCoeffs_i; - int r = ycrcb[0] + CV_DESCALE(coeff[0] * (ycrcb[1] - HALF_MAX), yuv_shift); - int g = 
ycrcb[0] + CV_DESCALE(coeff[1] * (ycrcb[1] - HALF_MAX) + coeff[2] * (ycrcb[2] - HALF_MAX), yuv_shift); - int b = ycrcb[0] + CV_DESCALE(coeff[3] * (ycrcb[2] - HALF_MAX), yuv_shift); + __constant int * coeffs = c_YCrCb2RGBCoeffs_i; #endif - dst[dst_idx + (bidx^2)] = SAT_CAST(r); - dst[dst_idx + 1] = SAT_CAST(g); - dst[dst_idx + bidx] = SAT_CAST(b); -#if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; +#if (1 == pixels_per_work_item) + { + const DATA_TYPE ycrcb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float B = ycrcb[0] + (ycrcb[2] - HALF_MAX) * coeffs[3]; + float G = ycrcb[0] + (ycrcb[2] - HALF_MAX) * coeffs[2] + (ycrcb[1] - HALF_MAX) * coeffs[1]; + float R = ycrcb[0] + (ycrcb[1] - HALF_MAX) * coeffs[0]; +#else + int B = ycrcb[0] + CV_DESCALE((ycrcb[2] - HALF_MAX) * coeffs[3], yuv_shift); + int G = ycrcb[0] + CV_DESCALE((ycrcb[2] - HALF_MAX) * coeffs[2] + (ycrcb[1] - HALF_MAX) * coeffs[1], yuv_shift); + int R = ycrcb[0] + CV_DESCALE((ycrcb[1] - HALF_MAX) * coeffs[0], yuv_shift); #endif + + dst_ptr[bidx] = SAT_CAST( B ); + dst_ptr[1] = SAT_CAST( G ); + dst_ptr[(bidx^2)] = SAT_CAST( R ); +#if dcn == 4 + dst_ptr[3] = MAX_NUM; +#endif + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 Y = r0.s04; + const float2 Cr = r0.s15; + const float2 Cb = r0.s26; + + const float2 c0 = (bidx == 0) ? (Y + (Cb - HALF_MAX) * coeffs[3]) : (Y + (Cr - HALF_MAX) * coeffs[0]); + const float2 c1 = Y + (Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1]; + const float2 c2 = (bidx == 0) ? (Y + (Cr - HALF_MAX) * coeffs[0]) : (Y + (Cb - HALF_MAX) * coeffs[3]); +#else + const int2 Y = convert_int2(r0.s04); + const int2 Cr = convert_int2(r0.s15); + const int2 Cb = convert_int2(r0.s26); + + const int2 c0i = (bidx == 0) ? (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)); + const int2 c1i = Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1], yuv_shift); + const int2 c2i = (bidx == 0) ? (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR2 c0 = SAT_CAST2(c0i); + const VECTOR2 c1 = SAT_CAST2(c1i); + const VECTOR2 c2 = SAT_CAST2(c2i); +#endif + +#if dcn == 4 + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM), 0, dst_ptr); +#else + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0), 0, dst_ptr); +#endif + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 Y = convert_int4(r0.s048c); + const int4 Cr = convert_int4(r0.s159d); + const int4 Cb = convert_int4(r0.s26ae); + + const int4 c0i = (bidx == 0) ? (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)); + const int4 c1i = Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1], yuv_shift); + const int4 c2i = (bidx == 0) ? 
(Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR4 c0 = SAT_CAST4(c0i); + const VECTOR4 c1 = SAT_CAST4(c1i); + const VECTOR4 c2 = SAT_CAST4(c2i); + +#if dcn == 4 + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM, c0.s2, c1.s2, c2.s2, MAX_NUM, c0.s3, c1.s3, c2.s3, MAX_NUM), 0, dst_ptr); +#else + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0, c0.s2, c1.s2, c2.s2, 0, c0.s3, c1.s3, c2.s3, 0), 0, dst_ptr); +#endif +#endif + } +#endif //pixels_per_work_item } } ///////////////////////////////////// RGB <-> XYZ ////////////////////////////////////// __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset, __constant COEFF_TYPE * coeffs) { - int dx = get_global_id(0); + int dx = get_global_id(0) * pixels_per_work_item; int dy = get_global_id(1); if (dy < rows && dx < cols) @@ -362,28 +700,84 @@ __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(dy, src_step, src_offset + dx); int dst_idx = mad24(dy, dst_step, dst_offset + dx); - DATA_TYPE r = src[src_idx], g = src[src_idx + 1], b = src[src_idx + 2]; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + +#if (1 == pixels_per_work_item) + { + DATA_TYPE R = src_ptr[0], G = src_ptr[1], B = src_ptr[2]; #ifdef DEPTH_5 - float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2]; - float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5]; - float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8]; + float X = R * coeffs[0] + G * coeffs[1] + B * coeffs[2]; + float Y = R * coeffs[3] + G * coeffs[4] + B * coeffs[5]; + float Z = R * coeffs[6] + G * coeffs[7] + B * coeffs[8]; #else - int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift); - int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift); - int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift); + int X = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift); + int Y = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift); + int Z = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift); #endif - dst[dst_idx] = SAT_CAST(x); - dst[dst_idx + 1] = SAT_CAST(y); - dst[dst_idx + 2] = SAT_CAST(z); + + dst_ptr[0] = SAT_CAST( X ); + dst_ptr[1] = SAT_CAST( Y ); + dst_ptr[2] = SAT_CAST( Z ); + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 R = r0.s04; + const float2 G = r0.s15; + const float2 B = r0.s26; + + const float2 X = R * coeffs[0] + G * coeffs[1] + B * coeffs[2]; + const float2 Y = R * coeffs[3] + G * coeffs[4] + B * coeffs[5]; + const float2 Z = R * coeffs[6] + G * coeffs[7] + B * coeffs[8]; +#else + const int2 R = convert_int2(r0.s04); + const int2 G = convert_int2(r0.s15); + const int2 B = convert_int2(r0.s26); + + const int2 xi = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift); + const int2 yi = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift); + const int2 zi = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift); + + const VECTOR2 X = SAT_CAST2(xi); + const VECTOR2 Y = SAT_CAST2(yi); + const VECTOR2 Z = SAT_CAST2(zi); +#endif + + vstore8((VECTOR8)(X.s0, 
Y.s0, Z.s0, 0, X.s1, Y.s1, Z.s1, 0), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 R = convert_int4(r0.s048c); + const int4 G = convert_int4(r0.s159d); + const int4 B = convert_int4(r0.s26ae); + + const int4 xi = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift); + const int4 yi = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift); + const int4 zi = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift); + + const VECTOR4 X = SAT_CAST4(xi); + const VECTOR4 Y = SAT_CAST4(yi); + const VECTOR4 Z = SAT_CAST4(zi); + + vstore16((VECTOR16)(X.s0, Y.s0, Z.s0, 0, X.s1, Y.s1, Z.s1, 0, X.s2, Y.s2, Z.s2, 0, X.s3, Y.s3, Z.s3, 0), 0, dst_ptr); +#endif + } +#endif //pixels_per_work_item } } __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset, __constant COEFF_TYPE * coeffs) { - int dx = get_global_id(0); + int dx = get_global_id(0) * pixels_per_work_item; int dy = get_global_id(1); if (dy < rows && dx < cols) @@ -392,23 +786,87 @@ __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(dy, src_step, src_offset + dx); int dst_idx = mad24(dy, dst_step, dst_offset + dx); - DATA_TYPE x = src[src_idx], y = src[src_idx + 1], z = src[src_idx + 2]; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + +#if (1 == pixels_per_work_item) + { + const DATA_TYPE X = src_ptr[0], Y = src_ptr[1], Z = src_ptr[2]; #ifdef DEPTH_5 - float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2]; - float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5]; - float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8]; + float B = X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2]; + float G = X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5]; + float R = X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8]; #else - int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift); - int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift); - int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift); + int B = CV_DESCALE(X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2], xyz_shift); + int G = CV_DESCALE(X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5], xyz_shift); + int R = CV_DESCALE(X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8], xyz_shift); #endif - dst[dst_idx] = SAT_CAST(b); - dst[dst_idx + 1] = SAT_CAST(g); - dst[dst_idx + 2] = SAT_CAST(r); + + dst_ptr[0] = SAT_CAST( B ); + dst_ptr[1] = SAT_CAST( G ); + dst_ptr[2] = SAT_CAST( R ); #if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; + dst_ptr[3] = MAX_NUM; #endif + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 X = r0.s04; + const float2 Y = r0.s15; + const float2 Z = r0.s26; + + float2 B = X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2]; + float2 G = X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5]; + float2 R = X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8]; +#else + const int2 xi = convert_int2(r0.s04); + const int2 yi = convert_int2(r0.s15); + const int2 zi = convert_int2(r0.s26); + + const int2 bi = CV_DESCALE(xi * coeffs[0] + yi * coeffs[1] + zi * coeffs[2], xyz_shift); + const int2 gi = CV_DESCALE(xi * coeffs[3] + yi * coeffs[4] + zi * coeffs[5], xyz_shift); + const int2 ri = 
CV_DESCALE(xi * coeffs[6] + yi * coeffs[7] + zi * coeffs[8], xyz_shift); + + const VECTOR2 R = SAT_CAST2(ri); + const VECTOR2 G = SAT_CAST2(gi); + const VECTOR2 B = SAT_CAST2(bi); +#endif + +#if dcn == 4 + vstore8((VECTOR8)(B.s0, G.s0, R.s0, MAX_NUM, B.s1, G.s1, R.s1, MAX_NUM), 0, dst_ptr); +#else + vstore8((VECTOR8)(B.s0, G.s0, R.s0, 0, B.s1, G.s1, R.s1, 0), 0, dst_ptr); +#endif + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 xi = convert_int4(r0.s048c); + const int4 yi = convert_int4(r0.s159d); + const int4 zi = convert_int4(r0.s26ae); + + const int4 bi = CV_DESCALE(xi * coeffs[0] + yi * coeffs[1] + zi * coeffs[2], xyz_shift); + const int4 gi = CV_DESCALE(xi * coeffs[3] + yi * coeffs[4] + zi * coeffs[5], xyz_shift); + const int4 ri = CV_DESCALE(xi * coeffs[6] + yi * coeffs[7] + zi * coeffs[8], xyz_shift); + + const VECTOR4 R = SAT_CAST4(ri); + const VECTOR4 G = SAT_CAST4(gi); + const VECTOR4 B = SAT_CAST4(bi); + +#if dcn == 4 + vstore16((VECTOR16)(B.s0, G.s0, R.s0, MAX_NUM, B.s1, G.s1, R.s1, MAX_NUM, B.s2, G.s2, R.s2, MAX_NUM, B.s3, G.s3, R.s3, MAX_NUM), 0, dst_ptr); +#else + vstore16((VECTOR16)(B.s0, G.s0, R.s0, 0, B.s1, G.s1, R.s1, 0, B.s2, G.s2, R.s2, 0, B.s3, G.s3, R.s3, 0), 0, dst_ptr); +#endif +#endif + } +#endif // pixels_per_work_item } } @@ -427,6 +885,7 @@ __kernel void RGB(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); +#ifndef INTEL_DEVICE #ifdef REVERSE dst[dst_idx] = src[src_idx + 2]; dst[dst_idx + 1] = src[src_idx + 1]; @@ -444,12 +903,43 @@ __kernel void RGB(int cols, int rows, int src_step, int dst_step, dst[dst_idx + 3] = src[src_idx + 3]; #endif #endif +#else //INTEL_DEVICE + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + + const VECTOR4 r0 = vload4(0, src_ptr); +#ifdef REVERSE + if (3 == dcn) + { + vstore4((VECTOR4)(r0.s210, 0), 0, dst_ptr); + } + else if (3 == scn) + { + vstore4((VECTOR4)(r0.s210, MAX_NUM), 0, dst_ptr); + } + else { + vstore4((VECTOR4)(r0.s2103), 0, dst_ptr); + } +#elif defined ORDER + if (3 == dcn) + { + vstore4((VECTOR4)(r0.s012, 0), 0, dst_ptr); + } + else if (3 == scn) + { + vstore4((VECTOR4)(r0.s012, MAX_NUM), 0, dst_ptr); + } + else { + vstore4(r0, 0, dst_ptr); + } +#endif +#endif //INTEL_DEVICE } } ///////////////////////////////////// RGB5x5 <-> RGB ////////////////////////////////////// -__kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, __global const ushort * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -482,7 +972,7 @@ __kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, int bid } } -__kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global ushort * dst, int src_offset, int dst_offset) { @@ -507,7 +997,7 @@ __kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, int bid ///////////////////////////////////// RGB5x5 <-> RGB ////////////////////////////////////// -__kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, __global const ushort * src, __global uchar * dst, int src_offset, int 
dst_offset) { @@ -532,7 +1022,7 @@ __kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, int bi } } -__kernel void Gray2BGR5x5(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void Gray2BGR5x5(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global ushort * dst, int src_offset, int dst_offset) { @@ -560,7 +1050,7 @@ __constant int sector_data[][3] = { {1, 3, 0}, { 1, 0, 2 }, { 3, 0, 1 }, { 0, 2, #ifdef DEPTH_0 -__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset, __constant int * sdiv_table, __constant int * hdiv_table) @@ -600,7 +1090,7 @@ __kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -656,7 +1146,7 @@ __kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #elif defined DEPTH_5 -__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -698,7 +1188,7 @@ __kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -758,7 +1248,7 @@ __kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #ifdef DEPTH_0 -__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -805,7 +1295,7 @@ __kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -860,7 +1350,7 @@ __kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #elif defined DEPTH_5 -__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -907,7 +1397,7 @@ __kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -968,33 +1458,10 @@ __kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #ifdef DEPTH_0 __kernel void RGBA2mRGBA(int cols, int rows, int src_step, int dst_step, - int bidx, __global const uchar * src, __global uchar * dst, - int src_offset, int dst_offset) -{ - int x = get_global_id(0); - int y = get_global_id(1); 
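/*
 * Illustrative aside, not part of the patch: the colour-conversion kernels
 * above now receive the blue-channel index 'bidx' and 'pixels_per_work_item'
 * as compile-time defines instead of kernel arguments, and their integer
 * paths round with CV_DESCALE. The small C program below reproduces that
 * fixed-point RGB-to-gray arithmetic on the host so the rounding can be
 * checked against a scalar reference; the function name and sample pixel
 * values are mine, the constants come from the kernel source.
 */
#include <stdio.h>

#define CV_DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
enum { yuv_shift = 14, R2Y = 4899, G2Y = 9617, B2Y = 1868 };

/* bidx is the position of the blue channel (0 for BGR, 2 for RGB);
 * red then sits at bidx ^ 2, exactly as the kernels index it. */
static unsigned char rgb2gray(const unsigned char *pix, int bidx)
{
    return (unsigned char)CV_DESCALE(
        pix[bidx] * B2Y + pix[1] * G2Y + pix[bidx ^ 2] * R2Y, yuv_shift);
}

int main(void)
{
    const unsigned char bgr[3] = { 250, 30, 100 }; /* B, G, R */
    const unsigned char rgb[3] = { 100, 30, 250 }; /* same pixel as R, G, B */
    /* Both layouts give the same gray value once bidx is set accordingly. */
    printf("%d %d\n", rgb2gray(bgr, 0), rgb2gray(rgb, 2));
    return 0;
}
/*
 * Because B2Y + G2Y + R2Y == 1 << yuv_shift, the weights sum to one and the
 * intermediate sum stays far below 32-bit range for 8-bit inputs.
 */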
- - if (y < rows && x < cols) - { - x <<= 2; - int src_idx = mad24(y, src_step, src_offset + x); - int dst_idx = mad24(y, dst_step, dst_offset + x); - - uchar v0 = src[src_idx], v1 = src[src_idx + 1]; - uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3]; - - dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 3] = v3; - } -} - -__kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, int bidx, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -1003,14 +1470,129 @@ __kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, int bid int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - uchar v0 = src[src_idx], v1 = src[src_idx + 1]; - uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3]; - uchar v3_half = v3 / 2; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); - dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 3] = v3; +#if (1 == pixels_per_work_item) + { + const uchar4 r0 = vload4(0, src_ptr); + + dst_ptr[0] = (r0.s0 * r0.s3 + HALF_MAX) / MAX_NUM; + dst_ptr[1] = (r0.s1 * r0.s3 + HALF_MAX) / MAX_NUM; + dst_ptr[2] = (r0.s2 * r0.s3 + HALF_MAX) / MAX_NUM; + dst_ptr[3] = r0.s3; + } +#elif (2 == pixels_per_work_item) + { + const uchar8 r0 = vload8(0, src_ptr); + + const int2 v0 = convert_int2(r0.s04); + const int2 v1 = convert_int2(r0.s15); + const int2 v2 = convert_int2(r0.s26); + const int2 v3 = convert_int2(r0.s37); + + const int2 ri = (v0 * v3 + HALF_MAX) / MAX_NUM; + const int2 gi = (v1 * v3 + HALF_MAX) / MAX_NUM; + const int2 bi = (v2 * v3 + HALF_MAX) / MAX_NUM; + + const uchar2 r = convert_uchar2(ri); + const uchar2 g = convert_uchar2(gi); + const uchar2 b = convert_uchar2(bi); + + vstore8((uchar8)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { + const uchar16 r0 = vload16(0, src_ptr); + + const int4 v0 = convert_int4(r0.s048c); + const int4 v1 = convert_int4(r0.s159d); + const int4 v2 = convert_int4(r0.s26ae); + const int4 v3 = convert_int4(r0.s37bf); + + const int4 ri = (v0 * v3 + HALF_MAX) / MAX_NUM; + const int4 gi = (v1 * v3 + HALF_MAX) / MAX_NUM; + const int4 bi = (v2 * v3 + HALF_MAX) / MAX_NUM; + + const uchar4 r = convert_uchar4(ri); + const uchar4 g = convert_uchar4(gi); + const uchar4 b = convert_uchar4(bi); + + vstore16((uchar16)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1, r.s2, g.s2, b.s2, v3.s2, r.s3, g.s3, b.s3, v3.s3), 0, dst_ptr); + } +#endif // pixels_per_work_item + } +} + +__kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, + __global const uchar * src, __global uchar * dst, + int src_offset, int dst_offset) +{ + int x = get_global_id(0) * pixels_per_work_item; + int y = get_global_id(1); + + if (y < rows && x < cols) + { + x <<= 2; + int src_idx = mad24(y, src_step, src_offset + x); + int dst_idx = mad24(y, dst_step, dst_offset + x); + + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + +#if (1 == 
pixels_per_work_item) + { + const uchar4 r0 = vload4(0, src_ptr); + const uchar v3_half = r0.s3 / 2; + + const uchar r = (r0.s3 == 0) ? 0 : (r0.s0 * MAX_NUM + v3_half) / r0.s3; + const uchar g = (r0.s3 == 0) ? 0 : (r0.s1 * MAX_NUM + v3_half) / r0.s3; + const uchar b = (r0.s3 == 0) ? 0 : (r0.s2 * MAX_NUM + v3_half) / r0.s3; + + vstore4((uchar4)(r, g, b, r0.s3), 0, dst_ptr); + } +#elif (2 == pixels_per_work_item) + { + const uchar8 r0 = vload8(0, src_ptr); + + const int2 v0 = convert_int2(r0.s04); + const int2 v1 = convert_int2(r0.s15); + const int2 v2 = convert_int2(r0.s26); + const int2 v3 = convert_int2(r0.s37); + const int2 v3_half = v3 / 2; + + const int2 ri = (v3 == 0) ? 0 : (v0 * MAX_NUM + v3_half) / v3; + const int2 gi = (v3 == 0) ? 0 : (v1 * MAX_NUM + v3_half) / v3; + const int2 bi = (v3 == 0) ? 0 : (v2 * MAX_NUM + v3_half) / v3; + + const uchar2 r = convert_uchar2(ri); + const uchar2 g = convert_uchar2(gi); + const uchar2 b = convert_uchar2(bi); + + vstore8((uchar8)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { + const uchar16 r0 = vload16(0, src_ptr); + + const int4 v0 = convert_int4(r0.s048c); + const int4 v1 = convert_int4(r0.s159d); + const int4 v2 = convert_int4(r0.s26ae); + const int4 v3 = convert_int4(r0.s37bf); + const int4 v3_half = v3 / 2; + + + const int4 ri = (v3 == 0) ? 0 : (v0 * MAX_NUM + v3_half) / v3; + const int4 gi = (v3 == 0) ? 0 : (v1 * MAX_NUM + v3_half) / v3; + const int4 bi = (v3 == 0) ? 0 : (v2 * MAX_NUM + v3_half) / v3; + + const uchar4 r = convert_uchar4(ri); + const uchar4 g = convert_uchar4(gi); + const uchar4 b = convert_uchar4(bi); + + vstore16((uchar16)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1, r.s2, g.s2, b.s2, v3.s2, r.s3, g.s3, b.s3, v3.s3), 0, dst_ptr); + } +#endif // pixels_per_work_item } } diff --git a/modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl b/modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl new file mode 100644 index 000000000..c6555bff0 --- /dev/null +++ b/modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl @@ -0,0 +1,185 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2013, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
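/*
 * Illustrative aside, not part of the patch: a scalar C reference for the
 * alpha (un)premultiplication rounding used by the vectorized RGBA2mRGBA and
 * mRGBA2RGBA kernels above, handy for checking the vload/vstore paths against
 * a plain implementation. Only the formulas come from the kernels; the
 * function names and test values are mine.
 */
#include <stdio.h>

enum { MAX_NUM = 255, HALF_MAX = 128 }; /* DEPTH_0 constants from the kernel */

static unsigned char premultiply(unsigned char v, unsigned char a)
{
    return (unsigned char)((v * a + HALF_MAX) / MAX_NUM);
}

static unsigned char unpremultiply(unsigned char v, unsigned char a)
{
    /* a == 0 maps to 0, matching the (v3 == 0) ? 0 : ... select above. */
    return a == 0 ? 0 : (unsigned char)((v * MAX_NUM + a / 2) / a);
}

int main(void)
{
    const unsigned char v = 200;
    const unsigned char alphas[2] = { 37, 3 };
    for (int k = 0; k < 2; k++)
    {
        unsigned char m = premultiply(v, alphas[k]);
        printf("a=%d premultiplied=%d restored=%d\n",
               alphas[k], m, unpremultiply(m, alphas[k]));
    }
    return 0;
}
/*
 * The round trip is exact here for a = 37 but lossy for a = 3: very small
 * alphas collapse the colour range, which is inherent to premultiplication,
 * not to the OpenCL port.
 */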
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////Macro for border type//////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef BORDER_CONSTANT +//CCCCCC|abcdefgh|CCCCCCC +#define EXTRAPOLATE(x, maxV) +#elif defined BORDER_REPLICATE +//aaaaaa|abcdefgh|hhhhhhh +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = max(min((x), (maxV) - 1), 0); \ + } +#elif defined BORDER_WRAP +//cdefgh|abcdefgh|abcdefg +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = ( (x) + (maxV) ) % (maxV); \ + } +#elif defined BORDER_REFLECT +//fedcba|abcdefgh|hgfedcb +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \ + } +#elif defined BORDER_REFLECT_101 +//gfedcb|abcdefgh|gfedcba +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \ + } +#else +#error No extrapolation method +#endif + +#define SRC(_x,_y) CONVERT_SRCTYPE(((global SRCTYPE*)(Src+(_y)*SrcPitch))[_x]) + +#ifdef BORDER_CONSTANT +//CCCCCC|abcdefgh|CCCCCCC +#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? 
(const_v) : SRC((_x),(_y)) +#else +#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y)) +#endif + +#define DST(_x,_y) (((global DSTTYPE*)(Dst+DstOffset+(_y)*DstPitch))[_x]) + +//horizontal and vertical filter kernels +//should be defined on host during compile time to avoid overhead +__constant uint mat_kernelX[] = {KERNEL_MATRIX_X}; +__constant uint mat_kernelY[] = {KERNEL_MATRIX_Y}; + +__kernel __attribute__((reqd_work_group_size(BLK_X,BLK_Y,1))) void sep_filter_singlepass + ( + __global uchar* Src, + const uint SrcPitch, + const int srcOffsetX, + const int srcOffsetY, + __global uchar* Dst, + const int DstOffset, + const uint DstPitch, + int width, + int height, + int dstWidth, + int dstHeight + ) +{ + //RADIUSX, RADIUSY are filter dimensions + //BLK_X, BLK_Y are local wrogroup sizes + //all these should be defined on host during compile time + //first lsmem array for source pixels used in first pass, + //second lsmemDy for storing first pass results + __local WORKTYPE lsmem[BLK_Y+2*RADIUSY][BLK_X+2*RADIUSX]; + __local WORKTYPE lsmemDy[BLK_Y][BLK_X+2*RADIUSX]; + + //get local and global ids - used as image and local memory array indexes + int lix = get_local_id(0); + int liy = get_local_id(1); + + int x = (int)get_global_id(0); + int y = (int)get_global_id(1); + + //calculate pixel position in source image taking image offset into account + int srcX = x + srcOffsetX - RADIUSX; + int srcY = y + srcOffsetY - RADIUSY; + int xb = srcX; + int yb = srcY; + + //extrapolate coordinates, if needed + //and read my own source pixel into local memory + //with account for extra border pixels, which will be read by starting workitems + int clocY = liy; + int cSrcY = srcY; + do + { + int yb = cSrcY; + EXTRAPOLATE(yb, (height)); + + int clocX = lix; + int cSrcX = srcX; + do + { + int xb = cSrcX; + EXTRAPOLATE(xb,(width)); + lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 ); + + clocX += BLK_X; + cSrcX += BLK_X; + } + while(clocX < BLK_X+(RADIUSX*2)); + + clocY += BLK_Y; + cSrcY += BLK_Y; + } + while(clocY < BLK_Y+(RADIUSY*2)); + barrier(CLK_LOCAL_MEM_FENCE); + + //do vertical filter pass + //and store intermediate results to second local memory array + int i; + WORKTYPE sum = 0.0f; + int clocX = lix; + do + { + sum = 0.0f; + for(i=0; i<=2*RADIUSY; i++) + sum = mad(lsmem[liy+i][clocX], as_float(mat_kernelY[i]), sum); + lsmemDy[liy][clocX] = sum; + clocX += BLK_X; + } + while(clocX < BLK_X+(RADIUSX*2)); + barrier(CLK_LOCAL_MEM_FENCE); + + //if this pixel happened to be out of image borders because of global size rounding, + //then just return + if( x >= dstWidth || y >=dstHeight ) return; + + //do second horizontal filter pass + //and calculate final result + sum = 0.0f; + for(i=0; i<=2*RADIUSX; i++) + sum = mad(lsmemDy[liy][lix+i], as_float(mat_kernelX[i]), sum); + + //store result into destination image + DST(x,y) = CONVERT_DSTTYPE(sum); +} diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl index 980e85dd2..d6e5fb9ba 100644 --- a/modules/ocl/src/opencl/haarobjectdetect.cl +++ b/modules/ocl/src/opencl/haarobjectdetect.cl @@ -126,13 +126,11 @@ __kernel void gpuRunHaarClassifierCascadePacked( ) { -// this version used information provided for each workgroup -// no empty WG int gid = (int)get_group_id(0); int lid_x = (int)get_local_id(0); int lid_y = (int)get_local_id(1); int lid = lid_y*LSx+lid_x; - int4 WGInfo = pWGInfo[gid]; + int4 WGInfo = pWGInfo[WGSTART+gid]; int GroupX = (WGInfo.y >> 16)&0xFFFF; int GroupY = (WGInfo.y >> 0 )& 0xFFFF; int 
Width = (WGInfo.x >> 16)&0xFFFF; @@ -140,8 +138,8 @@ __kernel void gpuRunHaarClassifierCascadePacked( int ImgOffset = WGInfo.z; float ScaleFactor = as_float(WGInfo.w); -#define DATA_SIZE_X (LSx+WND_SIZE_X) -#define DATA_SIZE_Y (LSy+WND_SIZE_Y) +#define DATA_SIZE_X (PIXEL_STEP*LSx+WND_SIZE_X) +#define DATA_SIZE_Y (PIXEL_STEP*LSy+WND_SIZE_Y) #define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y) local int SumL[DATA_SIZE]; @@ -165,9 +163,11 @@ __kernel void gpuRunHaarClassifierCascadePacked( int4 info1 = p; int4 info2 = pq; - { - int xl = lid_x; - int yl = lid_y; + // calc processed ROI coordinate in local mem + int xl = lid_x*PIXEL_STEP; + int yl = lid_y*PIXEL_STEP; + + {// calc variance_norm_factor for all stages int OffsetLocal = yl * DATA_SIZE_X + xl; int OffsetGlobal = (GroupY+yl)* pixelstep + (GroupX+xl); @@ -194,13 +194,13 @@ __kernel void gpuRunHaarClassifierCascadePacked( int result = (1.0f>0.0f); for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ ) - {// iterate until candidate is exist + {// iterate until candidate is valid float stage_sum = 0.0f; __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier)); + int lcl_off = (yl*DATA_SIZE_X)+(xl); int stagecount = stageinfo->count; float stagethreshold = stageinfo->threshold; - int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x); for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ ) { // simple macro to extract shorts from int @@ -212,7 +212,7 @@ __kernel void gpuRunHaarClassifierCascadePacked( int4 n1 = pN[1]; int4 n2 = pN[2]; float nodethreshold = as_float(n2.y) * variance_norm_factor; - // calc sum of intensity pixels according to node information + // calc sum of intensity pixels according to classifier node information float classsum = (SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) + (SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) + @@ -228,8 +228,8 @@ __kernel void gpuRunHaarClassifierCascadePacked( int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info if(index threshold) { float maxVal = val; + maxVal = fmax(GET_SRC_32F(j - 1, i - 1), maxVal); + maxVal = fmax(GET_SRC_32F(j , i - 1), maxVal); + maxVal = fmax(GET_SRC_32F(j + 1, i - 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j , i - 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal); + maxVal = fmax(GET_SRC_32F(j - 1, i), maxVal); + maxVal = fmax(GET_SRC_32F(j + 1, i), maxVal); - maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal); - - maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j , i + 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal); + maxVal = fmax(GET_SRC_32F(j - 1, i + 1), maxVal); + maxVal = fmax(GET_SRC_32F(j , i + 1), maxVal); + maxVal = fmax(GET_SRC_32F(j + 1, i + 1), maxVal); if (val == maxVal) { const int ind = atomic_inc(g_counter); if (ind < max_count) - corners[ind] = (float2)(j, i); + {// pack and store eigenvalue and its coordinates + corners[ind].x = val; + corners[ind].y = as_float(j|(i<<16)); + } } } } } +#undef GET_SRC_32F + //bitonic sort __kernel void sortCorners_bitonicSort ( - image2d_t eig, __global float2 * corners, const int count, const int stage, @@ -140,8 
+136,8 @@ __kernel const float2 leftPt = corners[leftId]; const float2 rightPt = corners[rightId]; - const float leftVal = ELEM_FLT2(eig, leftPt); - const float rightVal = ELEM_FLT2(eig, rightPt); + const float leftVal = leftPt.x; + const float rightVal = rightPt.x; const bool compareResult = leftVal > rightVal; @@ -152,124 +148,22 @@ __kernel corners[rightId] = sortOrder ? greater : lesser; } -//selection sort for gfft -//kernel is ported from Bolt library: -//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl -// Local sort will firstly sort elements of each workgroup using selection sort -// its performance is O(n) -__kernel - void sortCorners_selectionSortLocal - ( - image2d_t eig, - __global float2 * corners, - const int count, - __local float2 * scratch - ) +// this is simple short serial kernel that makes some short reduction and initialization work +// it makes HOST like work to avoid additional sync with HOST to do this short work +// data - input/output float2. +// input data are sevral (min,max) pairs +// output data is one reduced (min,max) pair +// g_counter - counter that have to be initialized by 0 for next findCorner call. +__kernel void arithm_op_minMax_final(__global float * data, int groupnum,__global int * g_counter) { - int i = get_local_id(0); // index in workgroup - int numOfGroups = get_num_groups(0); // index in workgroup - int groupID = get_group_id(0); - int wg = get_local_size(0); // workgroup size = block size - int n; // number of elements to be processed for this work group - - int offset = groupID * wg; - int same = 0; - corners += offset; - n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg; - float2 pt1, pt2; - - pt1 = corners[min(i, n)]; - scratch[i] = pt1; - barrier(CLK_LOCAL_MEM_FENCE); - - if(i >= n) + g_counter[0] = 0; + float minVal = data[0]; + float maxVal = data[groupnum]; + for(int i=1;i val1) - pos++;//calculate the rank of this element in this work group - else - { - if(val1 > val2) - continue; - else - { - // val1 and val2 are same - same++; - } - } - } - for (int j=0; j< same; j++) - corners[pos + j] = pt1; -} -__kernel - void sortCorners_selectionSortFinal - ( - image2d_t eig, - __global float2 * corners, - const int count - ) -{ - const int i = get_local_id(0); // index in workgroup - const int numOfGroups = get_num_groups(0); // index in workgroup - const int groupID = get_group_id(0); - const int wg = get_local_size(0); // workgroup size = block size - int pos = 0, same = 0; - const int offset = get_group_id(0) * wg; - const int remainder = count - wg*(numOfGroups-1); - - if((offset + i ) >= count) - return; - float2 pt1, pt2; - pt1 = corners[groupID*wg + i]; - - float val1 = ELEM_FLT2(eig, pt1); - float val2; - - for(int j=0; j val2) - break; - else - { - //Increment only if the value is not the same. - if( val2 > val1 ) - pos++; - else - same++; - } - } - } - - for(int k=0; k val2) - break; - else - { - //Don't increment if the value is the same. 
- //Two elements are same if (*userComp)(jData, iData) and (*userComp)(iData, jData) are both false - if(val2 > val1) - pos++; - else - same++; - } - } - for (int j=0; j< same; j++) - corners[pos + j] = pt1; -} + data[0] = minVal; + data[1] = maxVal; +} \ No newline at end of file diff --git a/modules/ocl/src/opencl/imgproc_sobel3.cl b/modules/ocl/src/opencl/imgproc_sobel3.cl index d6a995f55..8356fce01 100644 --- a/modules/ocl/src/opencl/imgproc_sobel3.cl +++ b/modules/ocl/src/opencl/imgproc_sobel3.cl @@ -1,45 +1,97 @@ /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////Macro for border type//////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////// -#ifdef BORDER_REPLICATE -//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) + +#ifdef BORDER_CONSTANT +//CCCCCC|abcdefgh|CCCCCCC +#define EXTRAPOLATE(x, maxV) +#elif defined BORDER_REPLICATE +//aaaaaa|abcdefgh|hhhhhhh +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = max(min((x), (maxV) - 1), 0); \ + } +#elif defined BORDER_WRAP +//cdefgh|abcdefgh|abcdefg +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = ( (x) + (maxV) ) % (maxV); \ + } +#elif defined BORDER_REFLECT +//fedcba|abcdefgh|hgfedcb +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min( mad24((maxV)-1,2,-(x))+1 , max((x),-(x)-1) ); \ + } +#elif defined BORDER_REFLECT_101 +//gfedcb|abcdefgh|gfedcba +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min( mad24((maxV)-1,2,-(x)), max((x),-(x)) ); \ + } +#else +#error No extrapolation method #endif -#ifdef BORDER_REFLECT -//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) +#define SRC(_x,_y) convert_float(((global SRCTYPE*)(Src+(_y)*SrcPitch))[_x]) + +#ifdef BORDER_CONSTANT +//CCCCCC|abcdefgh|CCCCCCC +#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y)) +#else +#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y)) #endif -#ifdef BORDER_REFLECT101 -//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) -#endif +#define DSTX(_x,_y) (((global float*)(DstX+DstXOffset+(_y)*DstXPitch))[_x]) +#define DSTY(_x,_y) (((global float*)(DstY+DstYOffset+(_y)*DstYPitch))[_x]) -#ifdef BORDER_WRAP -//BORDER_WRAP: cdefgh|abcdefgh|abcdefg -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? 
(i)-(b_edge) : (addr)) -#endif +#define INIT_AND_READ_LOCAL_SOURCE(width, height, fill_const, kernel_border) \ + int srcX = x + srcOffsetX - (kernel_border); \ + int srcY = y + srcOffsetY - (kernel_border); \ + int xb = srcX; \ + int yb = srcY; \ + \ + EXTRAPOLATE(xb, (width)); \ + EXTRAPOLATE(yb, (height)); \ + lsmem[liy][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \ + \ + if(lix < ((kernel_border)*2)) \ + { \ + int xb = srcX+BLK_X; \ + EXTRAPOLATE(xb,(width)); \ + lsmem[liy][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \ + } \ + if(liy< ((kernel_border)*2)) \ + { \ + int yb = srcY+BLK_Y; \ + EXTRAPOLATE(yb, (height)); \ + lsmem[liy+BLK_Y][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \ + } \ + if(lix<((kernel_border)*2) && liy<((kernel_border)*2)) \ + { \ + int xb = srcX+BLK_X; \ + int yb = srcY+BLK_Y; \ + EXTRAPOLATE(xb,(width)); \ + EXTRAPOLATE(yb,(height)); \ + lsmem[liy+BLK_Y][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \ + } __kernel void sobel3( __global uchar* Src, - __global float* DstX, - __global float* DstY, - int width, int height, - uint srcStride, uint dstStride, - float scale + const uint SrcPitch, + const int srcOffsetX, + const int srcOffsetY, + __global uchar* DstX, + const int DstXOffset, + const uint DstXPitch, + __global uchar* DstY, + const int DstYOffset, + const uint DstYPitch, + int width, + int height, + int dstWidth, + int dstHeight, + float scale ) { __local float lsmem[BLK_Y+2][BLK_X+2]; @@ -47,62 +99,249 @@ __kernel void sobel3( int lix = get_local_id(0); int liy = get_local_id(1); - int gix = get_group_id(0); - int giy = get_group_id(1); - - int id_x = get_global_id(0); - int id_y = get_global_id(1); - - lsmem[liy+1][lix+1] = convert_float(Src[ id_y * srcStride + id_x ]); - - int id_y_h = ADDR_H(id_y-1, 0,height); - int id_y_b = ADDR_B(id_y+1, height,id_y+1); - - int id_x_l = ADDR_L(id_x-1, 0,width); - int id_x_r = ADDR_R(id_x+1, width,id_x+1); - - if(liy==0) - { - lsmem[0][lix+1]=convert_float(Src[ id_y_h * srcStride + id_x ]); - - if(lix==0) - lsmem[0][0]=convert_float(Src[ id_y_h * srcStride + id_x_l ]); - else if(lix==BLK_X-1) - lsmem[0][BLK_X+1]=convert_float(Src[ id_y_h * srcStride + id_x_r ]); - } - else if(liy==BLK_Y-1) - { - lsmem[BLK_Y+1][lix+1]=convert_float(Src[ id_y_b * srcStride + id_x ]); - - if(lix==0) - lsmem[BLK_Y+1][0]=convert_float(Src[ id_y_b * srcStride + id_x_l ]); - else if(lix==BLK_X-1) - lsmem[BLK_Y+1][BLK_X+1]=convert_float(Src[ id_y_b * srcStride + id_x_r ]); - } - - if(lix==0) - lsmem[liy+1][0] = convert_float(Src[ id_y * srcStride + id_x_l ]); - else if(lix==BLK_X-1) - lsmem[liy+1][BLK_X+1] = convert_float(Src[ id_y * srcStride + id_x_r ]); + int x = (int)get_global_id(0); + int y = (int)get_global_id(1); + INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 1) barrier(CLK_LOCAL_MEM_FENCE); + if( x >= dstWidth || y >=dstHeight ) return; + float u1 = lsmem[liy][lix]; float u2 = lsmem[liy][lix+1]; float u3 = lsmem[liy][lix+2]; float m1 = lsmem[liy+1][lix]; - float m2 = lsmem[liy+1][lix+1]; float m3 = lsmem[liy+1][lix+2]; float b1 = lsmem[liy+2][lix]; float b2 = lsmem[liy+2][lix+1]; float b3 = lsmem[liy+2][lix+2]; - //m2 * scale;// - float dx = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1 ); - DstX[ id_y * dstStride + id_x ] = dx * scale; + //calc and store dx and dy;// +#ifdef SCHARR + DSTX(x,y) = mad(10.0f, m3 - m1, 3.0f * (u3 - u1 + b3 - b1)) * scale; + DSTY(x,y) = mad(10.0f, b2 - u2, 3.0f * (b1 - u1 + b3 - u3)) * scale; +#else + DSTX(x,y) = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1) * scale; + 
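// With the naming above (u = top row, m = middle row, b = bottom row, columns 1..3),
+    // the dx line evaluates the separable 3x3 Sobel kernel [1 2 1]^T * [-1 0 1], and the
+    // dy line below evaluates its transpose; the SCHARR branch keeps the same structure
+    // but uses the Scharr weights 3 and 10 in place of 1 and 2.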
DSTY(x,y) = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3) * scale; +#endif +} - float dy = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3); - DstY[ id_y * dstStride + id_x ] = dy * scale; -} \ No newline at end of file +__kernel void sobel5( + __global uchar* Src, + const uint SrcPitch, + const int srcOffsetX, + const int srcOffsetY, + __global uchar* DstX, + const int DstXOffset, + const uint DstXPitch, + __global uchar* DstY, + const int DstYOffset, + const uint DstYPitch, + int width, + int height, + int dstWidth, + int dstHeight, + float scale + ) +{ + __local float lsmem[BLK_Y+4][BLK_X+4]; + + int lix = get_local_id(0); + int liy = get_local_id(1); + + int x = (int)get_global_id(0); + int y = (int)get_global_id(1); + + INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 2) + barrier(CLK_LOCAL_MEM_FENCE); + + if( x >= dstWidth || y >=dstHeight ) return; + + float t1 = lsmem[liy][lix]; + float t2 = lsmem[liy][lix+1]; + float t3 = lsmem[liy][lix+2]; + float t4 = lsmem[liy][lix+3]; + float t5 = lsmem[liy][lix+4]; + + float u1 = lsmem[liy+1][lix]; + float u2 = lsmem[liy+1][lix+1]; + float u3 = lsmem[liy+1][lix+2]; + float u4 = lsmem[liy+1][lix+3]; + float u5 = lsmem[liy+1][lix+4]; + + float m1 = lsmem[liy+2][lix]; + float m2 = lsmem[liy+2][lix+1]; + float m4 = lsmem[liy+2][lix+3]; + float m5 = lsmem[liy+2][lix+4]; + + float l1 = lsmem[liy+3][lix]; + float l2 = lsmem[liy+3][lix+1]; + float l3 = lsmem[liy+3][lix+2]; + float l4 = lsmem[liy+3][lix+3]; + float l5 = lsmem[liy+3][lix+4]; + + float b1 = lsmem[liy+4][lix]; + float b2 = lsmem[liy+4][lix+1]; + float b3 = lsmem[liy+4][lix+2]; + float b4 = lsmem[liy+4][lix+3]; + float b5 = lsmem[liy+4][lix+4]; + + //calc and store dx and dy;// + DSTX(x,y) = scale * + mad(12.0f, m4 - m2, + mad(6.0f, m5 - m1, + mad(8.0f, u4 - u2 + l4 - l2, + mad(4.0f, u5 - u1 + l5 - l1, + mad(2.0f, t4 - t2 + b4 - b2, t5 - t1 + b5 - b1 ) + ) + ) + ) + ); + + DSTY(x,y) = scale * + mad(12.0f, l3 - u3, + mad(6.0f, b3 - t3, + mad(8.0f, l2 - u2 + l4 - u4, + mad(4.0f, b2 - t2 + b4 - t4, + mad(2.0f, l1 - u1 + l5 - u5, b1 - t1 + b5 - t5 ) + ) + ) + ) + ); +} + +__kernel void sobel7( + __global uchar* Src, + const uint SrcPitch, + const int srcOffsetX, + const int srcOffsetY, + __global uchar* DstX, + const int DstXOffset, + const uint DstXPitch, + __global uchar* DstY, + const int DstYOffset, + const uint DstYPitch, + int width, + int height, + int dstWidth, + int dstHeight, + float scale + ) +{ + __local float lsmem[BLK_Y+6][BLK_X+6]; + + int lix = get_local_id(0); + int liy = get_local_id(1); + + int x = (int)get_global_id(0); + int y = (int)get_global_id(1); + + INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 3) + barrier(CLK_LOCAL_MEM_FENCE); + + if( x >= dstWidth || y >=dstHeight ) return; + + float tt1 = lsmem[liy][lix]; + float tt2 = lsmem[liy][lix+1]; + float tt3 = lsmem[liy][lix+2]; + float tt4 = lsmem[liy][lix+3]; + float tt5 = lsmem[liy][lix+4]; + float tt6 = lsmem[liy][lix+5]; + float tt7 = lsmem[liy][lix+6]; + + float t1 = lsmem[liy+1][lix]; + float t2 = lsmem[liy+1][lix+1]; + float t3 = lsmem[liy+1][lix+2]; + float t4 = lsmem[liy+1][lix+3]; + float t5 = lsmem[liy+1][lix+4]; + float t6 = lsmem[liy+1][lix+5]; + float t7 = lsmem[liy+1][lix+6]; + + float u1 = lsmem[liy+2][lix]; + float u2 = lsmem[liy+2][lix+1]; + float u3 = lsmem[liy+2][lix+2]; + float u4 = lsmem[liy+2][lix+3]; + float u5 = lsmem[liy+2][lix+4]; + float u6 = lsmem[liy+2][lix+5]; + float u7 = lsmem[liy+2][lix+6]; + + float m1 = lsmem[liy+3][lix]; + float m2 = lsmem[liy+3][lix+1]; + float m3 = lsmem[liy+3][lix+2]; + float m5 = 
lsmem[liy+3][lix+4]; + float m6 = lsmem[liy+3][lix+5]; + float m7 = lsmem[liy+3][lix+6]; + + float l1 = lsmem[liy+4][lix]; + float l2 = lsmem[liy+4][lix+1]; + float l3 = lsmem[liy+4][lix+2]; + float l4 = lsmem[liy+4][lix+3]; + float l5 = lsmem[liy+4][lix+4]; + float l6 = lsmem[liy+4][lix+5]; + float l7 = lsmem[liy+4][lix+6]; + + float b1 = lsmem[liy+5][lix]; + float b2 = lsmem[liy+5][lix+1]; + float b3 = lsmem[liy+5][lix+2]; + float b4 = lsmem[liy+5][lix+3]; + float b5 = lsmem[liy+5][lix+4]; + float b6 = lsmem[liy+5][lix+5]; + float b7 = lsmem[liy+5][lix+6]; + + float bb1 = lsmem[liy+6][lix]; + float bb2 = lsmem[liy+6][lix+1]; + float bb3 = lsmem[liy+6][lix+2]; + float bb4 = lsmem[liy+6][lix+3]; + float bb5 = lsmem[liy+6][lix+4]; + float bb6 = lsmem[liy+6][lix+5]; + float bb7 = lsmem[liy+6][lix+6]; + + //calc and store dx and dy + DSTX(x,y) = scale * + mad(100.0f, m5 - m3, + mad(80.0f, m6 - m2, + mad(20.0f, m7 - m1, + mad(75.0f, u5 - u3 + l5 - l3, + mad(60.0f, u6 - u2 + l6 - l2, + mad(15.0f, u7 - u1 + l7 - l1, + mad(30.0f, t5 - t3 + b5 - b3, + mad(24.0f, t6 - t2 + b6 - b2, + mad(6.0f, t7 - t1 + b7 - b1, + mad(5.0f, tt5 - tt3 + bb5 - bb3, + mad(4.0f, tt6 - tt2 + bb6 - bb2, tt7 - tt1 + bb7 - bb1 ) + ) + ) + ) + ) + ) + ) + ) + ) + ) + ); + + DSTY(x,y) = scale * + mad(100.0f, l4 - u4, + mad(80.0f, b4 - t4, + mad(20.0f, bb4 - tt4, + mad(75.0f, l5 - u5 + l3 - u3, + mad(60.0f, b5 - t5 + b3 - t3, + mad(15.0f, bb5 - tt5 + bb3 - tt3, + mad(30.0f, l6 - u6 + l2 - u2, + mad(24.0f, b6 - t6 + b2 - t2, + mad(6.0f, bb6 - tt6 + bb2 - tt2, + mad(5.0f, l7 - u7 + l1 - u1, + mad(4.0f, b7 - t7 + b1 - t1, bb7 - tt7 + bb1 - tt1 ) + ) + ) + ) + ) + ) + ) + ) + ) + ) + ); +} diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl index 0d2f26f96..60d7346e5 100644 --- a/modules/ocl/src/opencl/objdetect_hog.cl +++ b/modules/ocl/src/opencl/objdetect_hog.cl @@ -50,6 +50,14 @@ #define NTHREADS 256 #define CV_PI_F 3.1415926535897932384626433832795f +#ifdef INTEL_DEVICE +#define QANGLE_TYPE int +#define QANGLE_TYPE2 int2 +#else +#define QANGLE_TYPE uchar +#define QANGLE_TYPE2 uchar2 +#endif + //---------------------------------------------------------------------------- // Histogram computation // 12 threads for a cell, 12x4 threads per block @@ -59,7 +67,7 @@ __kernel void compute_hists_lut_kernel( const int cnbins, const int cblock_hist_size, const int img_block_width, const int blocks_in_group, const int blocks_total, const int grad_quadstep, const int qangle_step, - __global const float* grad, __global const uchar* qangle, + __global const float* grad, __global const QANGLE_TYPE* qangle, __global const float* gauss_w_lut, __global float* block_hists, __local float* smem) { @@ -86,7 +94,7 @@ __kernel void compute_hists_lut_kernel( __global const float* grad_ptr = (gid < blocks_total) ? grad + offset_y * grad_quadstep + (offset_x << 1) : grad; - __global const uchar* qangle_ptr = (gid < blocks_total) ? + __global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ? 
qangle + offset_y * qangle_step + (offset_x << 1) : qangle; __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + @@ -101,7 +109,7 @@ __kernel void compute_hists_lut_kernel( for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) { float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); - uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]); + QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]); grad_ptr += grad_quadstep; qangle_ptr += qangle_step; @@ -558,7 +566,7 @@ __kernel void extract_descrs_by_cols_kernel( __kernel void compute_gradients_8UC4_kernel( const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - const __global uchar4 * img, __global float * grad, __global uchar * qangle, + const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle, const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); @@ -660,7 +668,7 @@ __kernel void compute_gradients_8UC4_kernel( __kernel void compute_gradients_8UC1_kernel( const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - __global const uchar * img, __global float * grad, __global uchar * qangle, + __global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle, const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp index 0472815bf..318f9e0a0 100644 --- a/modules/ts/src/ts_func.cpp +++ b/modules/ts/src/ts_func.cpp @@ -116,7 +116,7 @@ Mat randomMat(RNG& rng, Size size, int type, double minVal, double maxVal, bool Mat m(size0, type); - rng.fill(m, RNG::UNIFORM, Scalar::all(minVal), Scalar::all(maxVal)); + rng.fill(m, RNG::UNIFORM, minVal, maxVal); if( size0 == size ) return m; return m(Rect((size0.width-size.width)/2, (size0.height-size.height)/2, size.width, size.height)); @@ -142,7 +142,7 @@ Mat randomMat(RNG& rng, const vector& size, int type, double minVal, double Mat m(dims, &size0[0], type); - rng.fill(m, RNG::UNIFORM, Scalar::all(minVal), Scalar::all(maxVal)); + rng.fill(m, RNG::UNIFORM, minVal, maxVal); if( eqsize ) return m; return m(&r[0]); diff --git a/platforms/linux/arm-gnueabi.toolchain.cmake b/platforms/linux/arm-gnueabi.toolchain.cmake index c6b0469ad..2c5b7406d 100644 --- a/platforms/linux/arm-gnueabi.toolchain.cmake +++ b/platforms/linux/arm-gnueabi.toolchain.cmake @@ -28,14 +28,11 @@ set(CMAKE_MODULE_LINKER_FLAGS "-Wl,--fix-cortex-a8 -Wl,--no-undefined -Wl,--gc-s set(CMAKE_EXE_LINKER_FLAGS "-Wl,--fix-cortex-a8 -Wl,--no-undefined -Wl,--gc-sections -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now ${CMAKE_EXE_LINKER_FLAGS}") if(USE_NEON) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon") + message(WARNING "You use obsolete variable USE_NEON to enable NEON instruction set. Use -DENABLE_NEON=ON instead." ) + set(ENABLE_NEON TRUE) elseif(USE_VFPV3) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=vfpv3") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=vfpv3") -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=vfpv3-d16") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=vfpv3-d16") + message(WARNING "You use obsolete variable USE_VFPV3 to enable VFPV3 instruction set. Use -DENABLE_VFPV3=ON instead." 
)
+  set(ENABLE_VFPV3 TRUE)
 endif()

 set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${ARM_LINUX_SYSROOT})
diff --git a/samples/cpp/intelperc_capture.cpp b/samples/cpp/intelperc_capture.cpp
new file mode 100644
index 000000000..40349e0fb
--- /dev/null
+++ b/samples/cpp/intelperc_capture.cpp
@@ -0,0 +1,376 @@
+// intelperc_capture.cpp : Defines the entry point for the console application.
+//
+
+#include "opencv2/highgui/highgui.hpp"
+
+#include <iostream>
+
+using namespace cv;
+using namespace std;
+
+static bool g_printStreamSetting = false;
+static int g_imageStreamProfileIdx = -1;
+static int g_depthStreamProfileIdx = -1;
+static bool g_irStreamShow = false;
+static double g_imageBrightness = -DBL_MAX;
+static double g_imageContrast = -DBL_MAX;
+static bool g_printTiming = false;
+static bool g_showClosedPoint = false;
+
+
+static int g_closedDepthPoint[2];
+
+static void printUsage(const char *arg0)
+{
+    const char *filename = arg0;
+    while (*filename)
+        filename++;
+    while ((arg0 <= filename) && ('\\' != *filename) && ('/' != *filename))
+        filename--;
+    filename++;
+
+    cout << "This program demonstrates usage of a camera supported\nby the Intel Perceptual Computing SDK." << endl << endl;
+    cout << "usage: " << filename << " [-ps] [-isp IDX] [-dsp IDX]\n    [-ir] [-imb VAL] [-imc VAL] [-pts] [--show-closed]" << endl << endl;
+    cout << "   -ps,            print stream settings and profiles" << endl;
+    cout << "   -isp IDX,       set profile index of the image stream" << endl;
+    cout << "   -dsp IDX,       set profile index of the depth stream" << endl;
+    cout << "   -ir,            show data from the IR stream" << endl;
+    cout << "   -imb VAL,       set brightness value for the image stream" << endl;
+    cout << "   -imc VAL,       set contrast value for the image stream" << endl;
+    cout << "   -pts,           print frame index and frame time" << endl;
+    cout << "   --show-closed,  mark the depth point closest to the camera" << endl;
+    cout << endl;
+}
+
+static void parseCMDLine(int argc, char* argv[])
+{
+    if( argc == 1 )
+    {
+        printUsage(argv[0]);
+    }
+    else
+    {
+        for( int i = 1; i < argc; i++ )
+        {
+            if ((0 == strcmp(argv[i], "--help")) || (0 == strcmp( argv[i], "-h")))
+            {
+                printUsage(argv[0]);
+                exit(0);
+            }
+            else if ((0 == strcmp( argv[i], "--print-streams")) || (0 == strcmp( argv[i], "-ps")))
+            {
+                g_printStreamSetting = true;
+            }
+            else if ((0 == strcmp( argv[i], "--image-stream-prof")) || (0 == strcmp( argv[i], "-isp")))
+            {
+                g_imageStreamProfileIdx = atoi(argv[++i]);
+            }
+            else if ((0 == strcmp( argv[i], "--depth-stream-prof")) || (0 == strcmp( argv[i], "-dsp")))
+            {
+                g_depthStreamProfileIdx = atoi(argv[++i]);
+            }
+            else if (0 == strcmp( argv[i], "-ir"))
+            {
+                g_irStreamShow = true;
+            }
+            else if (0 == strcmp( argv[i], "-imb"))
+            {
+                g_imageBrightness = atof(argv[++i]);
+            }
+            else if (0 == strcmp( argv[i], "-imc"))
+            {
+                g_imageContrast = atof(argv[++i]);
+            }
+            else if (0 == strcmp(argv[i], "-pts"))
+            {
+                g_printTiming = true;
+            }
+            else if (0 == strcmp(argv[i], "--show-closed"))
+            {
+                g_showClosedPoint = true;
+            }
+            else
+            {
+                cout << "Unsupported command line argument: " << argv[i] << "." << endl;
+                exit(-1);
+            }
+        }
+        if (g_showClosedPoint && (-1 == g_depthStreamProfileIdx))
+        {
+            cerr << "For --show-closed a depth profile has to be selected" << endl;
+            exit(-1);
+        }
+    }
+}
+
+static void printStreamProperties(VideoCapture &capture)
+{
+    size_t profilesCount = (size_t)capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_INTELPERC_PROFILE_COUNT);
+    cout << "Image stream."
<< endl; + cout << " Brightness = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_BRIGHTNESS) << endl; + cout << " Contrast = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_CONTRAST) << endl; + cout << " Saturation = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_SATURATION) << endl; + cout << " Hue = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_HUE) << endl; + cout << " Gamma = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_GAMMA) << endl; + cout << " Sharpness = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_SHARPNESS) << endl; + cout << " Gain = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_GAIN) << endl; + cout << " Backligh = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_BACKLIGHT) << endl; + cout << "Image streams profiles:" << endl; + for (size_t i = 0; i < profilesCount; i++) + { + capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)i); + cout << " Profile[" << i << "]: "; + cout << "width = " << + (int)capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_FRAME_WIDTH); + cout << ", height = " << + (int)capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_FRAME_HEIGHT); + cout << ", fps = " << + capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_FPS); + cout << endl; + } + + profilesCount = (size_t)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_COUNT); + cout << "Depth stream." << endl; + cout << " Low confidence value = " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE) << endl; + cout << " Saturation value = " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE) << endl; + cout << " Confidence threshold = " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD) << endl; + cout << " Focal length = (" << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ) << ", " + << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT) << ")" << endl; + cout << "Depth streams profiles:" << endl; + for (size_t i = 0; i < profilesCount; i++) + { + capture.set(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)i); + cout << " Profile[" << i << "]: "; + cout << "width = " << + (int)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_FRAME_WIDTH); + cout << ", height = " << + (int)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_FRAME_HEIGHT); + cout << ", fps = " << + capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_FPS); + cout << endl; + } +} + +static void imshowImage(const char *winname, Mat &image, VideoCapture &capture) +{ + if (g_showClosedPoint) + { + Mat uvMap; + if (capture.retrieve(uvMap, CAP_INTELPERC_UVDEPTH_MAP)) + { + float *uvmap = (float *)uvMap.ptr() + 2 * (g_closedDepthPoint[0] * uvMap.cols + g_closedDepthPoint[1]); + int x = (int)((*uvmap) * image.cols); uvmap++; + int y = (int)((*uvmap) * image.rows); + + if ((0 <= x) && (0 <= y)) + { + static const int pointSize = 4; + for (int row = y; row < min(y + pointSize, image.rows); row++) + { + uchar* ptrDst = image.ptr(row) + x * 3 + 2;//+2 -> Red + for (int col = 0; col < min(pointSize, image.cols - x); col++, ptrDst+=3) + { + *ptrDst = 255; + } + } + } + } + } + imshow(winname, image); +} +static void imshowIR(const char *winname, Mat &ir) +{ + Mat image; + if (g_showClosedPoint) + { + image.create(ir.rows, ir.cols, CV_8UC3); + for (int row = 0; row < ir.rows; row++) + { 
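+            // Expand each 16-bit IR sample into an 8-bit grey value (>> 2, assuming roughly
+            // 10 bits of useful range) and replicate it into all three channels, so that the
+            // closest-point marker can be drawn in red on top of it below.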
+ uchar* ptrDst = image.ptr(row); + short* ptrSrc = (short*)ir.ptr(row); + for (int col = 0; col < ir.cols; col++, ptrSrc++) + { + uchar val = (uchar) ((*ptrSrc) >> 2); + *ptrDst = val; ptrDst++; + *ptrDst = val; ptrDst++; + *ptrDst = val; ptrDst++; + } + } + + static const int pointSize = 4; + for (int row = g_closedDepthPoint[0]; row < min(g_closedDepthPoint[0] + pointSize, image.rows); row++) + { + uchar* ptrDst = image.ptr(row) + g_closedDepthPoint[1] * 3 + 2;//+2 -> Red + for (int col = 0; col < min(pointSize, image.cols - g_closedDepthPoint[1]); col++, ptrDst+=3) + { + *ptrDst = 255; + } + } + } + else + { + image.create(ir.rows, ir.cols, CV_8UC1); + for (int row = 0; row < ir.rows; row++) + { + uchar* ptrDst = image.ptr(row); + short* ptrSrc = (short*)ir.ptr(row); + for (int col = 0; col < ir.cols; col++, ptrSrc++, ptrDst++) + { + *ptrDst = (uchar) ((*ptrSrc) >> 2); + } + } + } + + imshow(winname, image); +} +static void imshowDepth(const char *winname, Mat &depth, VideoCapture &capture) +{ + short lowValue = (short)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE); + short saturationValue = (short)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE); + + Mat image; + if (g_showClosedPoint) + { + image.create(depth.rows, depth.cols, CV_8UC3); + for (int row = 0; row < depth.rows; row++) + { + uchar* ptrDst = image.ptr(row); + short* ptrSrc = (short*)depth.ptr(row); + for (int col = 0; col < depth.cols; col++, ptrSrc++) + { + if ((lowValue == (*ptrSrc)) || (saturationValue == (*ptrSrc))) + { + *ptrDst = 0; ptrDst++; + *ptrDst = 0; ptrDst++; + *ptrDst = 0; ptrDst++; + } + else + { + uchar val = (uchar) ((*ptrSrc) >> 2); + *ptrDst = val; ptrDst++; + *ptrDst = val; ptrDst++; + *ptrDst = val; ptrDst++; + } + } + } + + static const int pointSize = 4; + for (int row = g_closedDepthPoint[0]; row < min(g_closedDepthPoint[0] + pointSize, image.rows); row++) + { + uchar* ptrDst = image.ptr(row) + g_closedDepthPoint[1] * 3 + 2;//+2 -> Red + for (int col = 0; col < min(pointSize, image.cols - g_closedDepthPoint[1]); col++, ptrDst+=3) + { + *ptrDst = 255; + } + } + } + else + { + image.create(depth.rows, depth.cols, CV_8UC1); + for (int row = 0; row < depth.rows; row++) + { + uchar* ptrDst = image.ptr(row); + short* ptrSrc = (short*)depth.ptr(row); + for (int col = 0; col < depth.cols; col++, ptrSrc++, ptrDst++) + { + if ((lowValue == (*ptrSrc)) || (saturationValue == (*ptrSrc))) + *ptrDst = 0; + else + *ptrDst = (uchar) ((*ptrSrc) >> 2); + } + } + } + imshow(winname, image); +} + +int main(int argc, char* argv[]) +{ + parseCMDLine(argc, argv); + + VideoCapture capture; + capture.open(CAP_INTELPERC); + if (!capture.isOpened()) + { + cerr << "Can not open a capture object." << endl; + return -1; + } + + if (g_printStreamSetting) + printStreamProperties(capture); + + if (-1 != g_imageStreamProfileIdx) + { + if (!capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)g_imageStreamProfileIdx)) + { + cerr << "Can not setup a image stream." << endl; + return -1; + } + } + if (-1 != g_depthStreamProfileIdx) + { + if (!capture.set(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)g_depthStreamProfileIdx)) + { + cerr << "Can not setup a depth stream." << endl; + return -1; + } + } + else if (g_irStreamShow) + { + if (!capture.set(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, 0.0)) + { + cerr << "Can not setup a IR stream." 
<< endl;
+            return -1;
+        }
+    }
+    else
+    {
+        cout << "Streams not selected" << endl;
+        return 0;
+    }
+
+    //Set up additional properties only after the stream profile has been selected
+    if ( (-10000.0 < g_imageBrightness) && (g_imageBrightness < 10000.0))
+        capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_BRIGHTNESS, g_imageBrightness);
+    if ( (0 < g_imageContrast) && (g_imageContrast < 10000.0))
+        capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_CONTRAST, g_imageContrast);
+
+    int frame = 0;
+    for(;;frame++)
+    {
+        Mat bgrImage;
+        Mat depthImage;
+        Mat irImage;
+
+        if (!capture.grab())
+        {
+            cout << "Cannot grab images." << endl;
+            return -1;
+        }
+
+        if ((-1 != g_depthStreamProfileIdx) && (capture.retrieve(depthImage, CAP_INTELPERC_DEPTH_MAP)))
+        {
+            if (g_showClosedPoint)
+            {
+                double minVal = 0.0; double maxVal = 0.0;
+                minMaxIdx(depthImage, &minVal, &maxVal, g_closedDepthPoint);
+            }
+            imshowDepth("depth image", depthImage, capture);
+        }
+        if ((g_irStreamShow) && (capture.retrieve(irImage, CAP_INTELPERC_IR_MAP)))
+            imshowIR("ir image", irImage);
+        if ((-1 != g_imageStreamProfileIdx) && (capture.retrieve(bgrImage, CAP_INTELPERC_IMAGE)))
+            imshowImage("color image", bgrImage, capture);
+
+        if (g_printTiming)
+        {
+            cout << "Image frame: " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_POS_FRAMES)
+                 << ", Depth(IR) frame: " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_POS_FRAMES) << endl;
+            cout << "Image time: " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_POS_MSEC)
+                 << ", Depth(IR) time: " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_POS_MSEC) << endl;
+        }
+        if( waitKey(30) >= 0 )
+            break;
+    }
+
+    return 0;
+}
diff --git a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
index 480229b53..1c8dbd24a 100644
--- a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
+++ b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
@@ -32,13 +32,13 @@ int main()
     for (int i = 0; i < image.rows; ++i)
         for (int j = 0; j < image.cols; ++j)
         {
-            Mat sampleMat = (Mat_<float>(1,2) << i,j);
+            Mat sampleMat = (Mat_<float>(1,2) << j,i);
             float response = SVM.predict(sampleMat);

             if (response == 1)
-                image.at<Vec3b>(j, i) = green;
+                image.at<Vec3b>(i,j) = green;
             else if (response == -1)
-                image.at<Vec3b>(j, i) = blue;
+                image.at<Vec3b>(i,j) = blue;
         }

     // Show the training data
diff --git a/samples/ocl/facedetect.cpp b/samples/ocl/facedetect.cpp
index fd570b515..c5059323e 100644
--- a/samples/ocl/facedetect.cpp
+++ b/samples/ocl/facedetect.cpp
@@ -14,7 +14,10 @@
 using namespace std;
 using namespace cv;
+
 #define LOOP_NUM 1
+#define MAX_THREADS 10
+

 ///////////////////////////single-threading faces detecting///////////////////////////////
@@ -29,23 +32,23 @@ const static Scalar colors[] = { CV_RGB(0,0,255),
 } ;

-int64 work_begin = 0;
-int64 work_end = 0;
+int64 work_begin[MAX_THREADS] = {0};
+int64 work_total[MAX_THREADS] = {0};
 string inputName, outputName, cascadeName;

-static void workBegin()
+static void workBegin(int i = 0)
 {
-    work_begin = getTickCount();
+    work_begin[i] = getTickCount();
 }
-static void workEnd()
+static void workEnd(int i = 0)
 {
-    work_end += (getTickCount() - work_begin);
+    work_total[i] += (getTickCount() - work_begin[i]);
 }
-static double getTime()
+static double getTotalTime(int i = 0)
 {
-    return work_end /((double)cvGetTickFrequency() * 1000.);
+    return work_total[i] /getTickFrequency() * 1000.;
 }

@@ -98,7 +101,6 @@ static int facedetect_one_thread(bool
useCPU, double scale ) } } - cvNamedWindow( "result", 1 ); if( capture ) { cout << "In capture ..." << endl; @@ -118,7 +120,6 @@ static int facedetect_one_thread(bool useCPU, double scale ) else resize(frameCopy0, frameCopy, Size(), 1./scale, 1./scale, INTER_LINEAR); - work_end = 0; if(useCPU) detectCPU(frameCopy, faces, cpu_cascade, 1); else @@ -132,16 +133,16 @@ static int facedetect_one_thread(bool useCPU, double scale ) } else { - cout << "In image read" << endl; + cout << "In image read " << image.size() << endl; vector faces; vector ref_rst; double accuracy = 0.; detectCPU(image, ref_rst, cpu_cascade, scale); - work_end = 0; + cout << "loops: "; for(int i = 0; i <= LOOP_NUM; i ++) { - cout << "loop" << i << endl; + cout << i << ", "; if(useCPU) detectCPU(image, faces, cpu_cascade, scale); else @@ -152,16 +153,15 @@ static int facedetect_one_thread(bool useCPU, double scale ) accuracy = checkRectSimilarity(image.size(), ref_rst, faces); } } - if (i == LOOP_NUM) - { - if (useCPU) - cout << "average CPU time (noCamera) : "; - else - cout << "average GPU time (noCamera) : "; - cout << getTime() / LOOP_NUM << " ms" << endl; - cout << "accuracy value: " << accuracy <= 1700) -#define MAX_THREADS 10 - -static void detectFaces(std::string fileName) +static void detectFaces(std::string fileName, int threadNum) { ocl::OclCascadeClassifier cascade; if(!cascade.load(cascadeName)) @@ -188,7 +186,7 @@ static void detectFaces(std::string fileName) Mat img = imread(fileName, CV_LOAD_IMAGE_COLOR); if (img.empty()) { - std::cout << "cann't open file " + fileName < oclfaces; - cascade.detectMultiScale(d_img, oclfaces, 1.1, 3, 0 | CASCADE_SCALE_IMAGE, Size(30, 30), Size(0, 0)); + std::thread::id tid = std::this_thread::get_id(); + std::cout << '[' << threadNum << "] " + << "ThreadID = " << tid + << ", CommandQueue = " << *(void**)ocl::getClCommandQueuePtr() + << endl; + for(int i = 0; i <= LOOP_NUM; i++) + { + if(i>0) workBegin(threadNum); + cascade.detectMultiScale(d_img, oclfaces, 1.1, 3, 0|CASCADE_SCALE_IMAGE, Size(30, 30), Size(0, 0)); + if(i>0) workEnd(threadNum); + } + std::cout << '[' << threadNum << "] " << "Average time = " << getTotalTime(threadNum) / LOOP_NUM << " ms" << endl; for(unsigned int i = 0; i threads; for(int i = 0; i= 1 }"; CommandLineParser cmd(argc, argv, keys); @@ -312,8 +323,6 @@ void detectCPU( Mat& img, vector& faces, void Draw(Mat& img, vector& faces, double scale) { int i = 0; - putText(img, format("fps: %.1f", 1000./getTime()), Point(450, 50), - FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3); for( vector::const_iterator r = faces.begin(); r != faces.end(); r++, i++ ) { Point center; @@ -324,8 +333,8 @@ void Draw(Mat& img, vector& faces, double scale) radius = cvRound((r->width + r->height)*0.25*scale); circle( img, center, radius, color, 3, 8, 0 ); } - //imwrite( outputName, img ); - if(abs(scale-1.0)>.001) + //if( !outputName.empty() ) imwrite( outputName, img ); + if( abs(scale-1.0)>.001 ) { resize(img, img, Size((int)(img.cols/scale), (int)(img.rows/scale))); }
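
The per-thread timing helpers and the std::thread launch pattern introduced in facedetect.cpp above can be summarised with a minimal, self-contained sketch. It reuses the names from the patch (MAX_THREADS, LOOP_NUM, workBegin, workEnd, getTotalTime), but dummyDetect and its busy loop are placeholders standing in for OclCascadeClassifier::detectMultiScale, so this only illustrates the threading and timing structure, not the detection itself; as in the patch, it assumes a C++11 tool chain for std::thread.

#include "opencv2/core/core.hpp"   // cv::getTickCount, cv::getTickFrequency

#include <iostream>
#include <thread>
#include <vector>

#define LOOP_NUM    1
#define MAX_THREADS 10

static long long work_begin[MAX_THREADS] = {0};   // long long stands in for OpenCV's int64
static long long work_total[MAX_THREADS] = {0};

static void workBegin(int i = 0) { work_begin[i] = cv::getTickCount(); }
static void workEnd(int i = 0)   { work_total[i] += cv::getTickCount() - work_begin[i]; }
static double getTotalTime(int i = 0)
{
    return work_total[i] / cv::getTickFrequency() * 1000.;   // elapsed time in milliseconds
}

// Placeholder for the OpenCL cascade call made by the real sample.
static void dummyDetect(int threadNum)
{
    for (int i = 0; i <= LOOP_NUM; i++)
    {
        if (i > 0) workBegin(threadNum);   // skip the first (warm-up) iteration, as the sample does
        volatile double sink = 0.;
        for (int k = 0; k < 1000000; k++)
            sink += k * 1e-9;              // stand-in workload
        if (i > 0) workEnd(threadNum);
    }
    std::cout << '[' << threadNum << "] average time = "
              << getTotalTime(threadNum) / LOOP_NUM << " ms" << std::endl;
}

int main()
{
    std::vector<std::thread> threads;
    for (int i = 0; i < MAX_THREADS; i++)
        threads.push_back(std::thread(dummyDetect, i));
    for (size_t i = 0; i < threads.size(); i++)
        threads[i].join();
    return 0;
}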