diff --git a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake index d5e1da297..f8c15920a 100644 --- a/cmake/OpenCVDetectOpenCL.cmake +++ b/cmake/OpenCVDetectOpenCL.cmake @@ -1,78 +1,138 @@ if(APPLE) - set(OPENCL_FOUND YES) - set(OPENCL_LIBRARIES "-framework OpenCL") + set(OPENCL_FOUND YES) + set(OPENCL_LIBRARIES "-framework OpenCL") else() - #find_package(OpenCL QUIET) - if(WITH_OPENCLAMDFFT) - find_path(CLAMDFFT_INCLUDE_DIR - NAMES clAmdFft.h) - find_library(CLAMDFFT_LIBRARIES - NAMES clAmdFft.Runtime) + find_package(OpenCL QUIET) + if(WITH_OPENCLAMDFFT) + set(CLAMDFFT_SEARCH_PATH $ENV{CLAMDFFT_PATH}) + if(NOT CLAMDFFT_SEARCH_PATH) + if(WIN32) + set( CLAMDFFT_SEARCH_PATH "C:\\Program Files (x86)\\AMD\\clAmdFft" ) + endif() endif() - if(WITH_OPENCLAMDBLAS) - find_path(CLAMDBLAS_INCLUDE_DIR - NAMES clAmdBlas.h) - find_library(CLAMDBLAS_LIBRARIES - NAMES clAmdBlas) - endif() - # Try AMD/ATI Stream SDK - if (NOT OPENCL_FOUND) - set(ENV_AMDSTREAMSDKROOT $ENV{AMDAPPSDKROOT}) - set(ENV_OPENCLROOT $ENV{OPENCLROOT}) - set(ENV_CUDA_PATH $ENV{CUDA_PATH}) - if(ENV_AMDSTREAMSDKROOT) - set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_AMDSTREAMSDKROOT}/include) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86) - else() - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86_64) - endif() - elseif(ENV_CUDA_PATH AND WIN32) - set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_CUDA_PATH}/include) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/Win32) - else() - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/x64) - endif() - elseif(ENV_OPENCLROOT AND UNIX) - set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_OPENCLROOT}/inc) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib) - else() - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib64) - endif() - endif() - - if(OPENCL_INCLUDE_SEARCH_PATH) - find_path(OPENCL_INCLUDE_DIR - NAMES CL/cl.h OpenCL/cl.h - PATHS ${OPENCL_INCLUDE_SEARCH_PATH} - NO_DEFAULT_PATH) - else() - find_path(OPENCL_INCLUDE_DIR - NAMES CL/cl.h OpenCL/cl.h) - endif() - - if(OPENCL_LIB_SEARCH_PATH) - find_library(OPENCL_LIBRARY NAMES OpenCL PATHS ${OPENCL_LIB_SEARCH_PATH} NO_DEFAULT_PATH) - else() - find_library(OPENCL_LIBRARY NAMES OpenCL) - endif() - - include(FindPackageHandleStandardArgs) - find_package_handle_standard_args( - OPENCL - DEFAULT_MSG - OPENCL_LIBRARY OPENCL_INCLUDE_DIR - ) - - if(OPENCL_FOUND) - set(OPENCL_LIBRARIES ${OPENCL_LIBRARY}) - set(HAVE_OPENCL 1) - else() - set(OPENCL_LIBRARIES) - endif() + set( CLAMDFFT_INCLUDE_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}/include ) + if(UNIX) + if(CMAKE_SIZEOF_VOID_P EQUAL 4) + set(CLAMDFFT_LIB_SEARCH_PATH /usr/lib) + else() + set(CLAMDFFT_LIB_SEARCH_PATH /usr/lib64) + endif() else() - set(HAVE_OPENCL 1) + if(CMAKE_SIZEOF_VOID_P EQUAL 4) + set(CLAMDFFT_LIB_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}\\lib32\\import) + else() + set(CLAMDFFT_LIB_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}\\lib64\\import) + endif() endif() + find_path(CLAMDFFT_INCLUDE_DIR + NAMES clAmdFft.h + PATHS ${CLAMDFFT_INCLUDE_SEARCH_PATH} + PATH_SUFFIXES clAmdFft + NO_DEFAULT_PATH) + find_library(CLAMDFFT_LIBRARY + NAMES clAmdFft.Runtime + PATHS ${CLAMDFFT_LIB_SEARCH_PATH} + NO_DEFAULT_PATH) + if(CLAMDFFT_LIBRARY) + set(CLAMDFFT_LIBRARIES ${CLAMDFFT_LIBRARY}) + else() + set(CLAMDFFT_LIBRARIES "") + endif() + endif() + if(WITH_OPENCLAMDBLAS) + set(CLAMDBLAS_SEARCH_PATH $ENV{CLAMDBLAS_PATH}) + if(NOT CLAMDBLAS_SEARCH_PATH) + if(WIN32) + set( CLAMDBLAS_SEARCH_PATH "C:\\Program Files (x86)\\AMD\\clAmdBlas" ) + endif() + endif() + set( CLAMDBLAS_INCLUDE_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}/include ) + if(UNIX) + if(CMAKE_SIZEOF_VOID_P EQUAL 4) + set(CLAMDBLAS_LIB_SEARCH_PATH /usr/lib) + else() + set(CLAMDBLAS_LIB_SEARCH_PATH /usr/lib64) + endif() + else() + if(CMAKE_SIZEOF_VOID_P EQUAL 4) + set(CLAMDBLAS_LIB_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}\\lib32\\import) + else() + set(CLAMDBLAS_LIB_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}\\lib64\\import) + endif() + endif() + find_path(CLAMDBLAS_INCLUDE_DIR + NAMES clAmdBlas.h + PATHS ${CLAMDBLAS_INCLUDE_SEARCH_PATH} + PATH_SUFFIXES clAmdBlas + NO_DEFAULT_PATH) + find_library(CLAMDBLAS_LIBRARY + NAMES clAmdBlas + PATHS ${CLAMDBLAS_LIB_SEARCH_PATH} + NO_DEFAULT_PATH) + if(CLAMDBLAS_LIBRARY) + set(CLAMDBLAS_LIBRARIES ${CLAMDBLAS_LIBRARY}) + else() + set(CLAMDBLAS_LIBRARIES "") + endif() + endif() + # Try AMD/ATI Stream SDK + if (NOT OPENCL_FOUND) + set(ENV_AMDSTREAMSDKROOT $ENV{AMDAPPSDKROOT}) + set(ENV_OPENCLROOT $ENV{OPENCLROOT}) + set(ENV_CUDA_PATH $ENV{CUDA_PATH}) + if(ENV_AMDSTREAMSDKROOT) + set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_AMDSTREAMSDKROOT}/include) + if(CMAKE_SIZEOF_VOID_P EQUAL 4) + set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86) + else() + set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86_64) + endif() + elseif(ENV_CUDA_PATH AND WIN32) + set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_CUDA_PATH}/include) + if(CMAKE_SIZEOF_VOID_P EQUAL 4) + set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/Win32) + else() + set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/x64) + endif() + elseif(ENV_OPENCLROOT AND UNIX) + set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_OPENCLROOT}/inc) + if(CMAKE_SIZEOF_VOID_P EQUAL 4) + set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib) + else() + set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib64) + endif() + endif() + + if(OPENCL_INCLUDE_SEARCH_PATH) + find_path(OPENCL_INCLUDE_DIR + NAMES CL/cl.h OpenCL/cl.h + PATHS ${OPENCL_INCLUDE_SEARCH_PATH} + NO_DEFAULT_PATH) + else() + find_path(OPENCL_INCLUDE_DIR + NAMES CL/cl.h OpenCL/cl.h) + endif() + + if(OPENCL_LIB_SEARCH_PATH) + find_library(OPENCL_LIBRARY NAMES OpenCL PATHS ${OPENCL_LIB_SEARCH_PATH} NO_DEFAULT_PATH) + else() + find_library(OPENCL_LIBRARY NAMES OpenCL) + endif() + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args( + OPENCL + DEFAULT_MSG + OPENCL_LIBRARY OPENCL_INCLUDE_DIR + ) + + if(OPENCL_FOUND) + set(OPENCL_LIBRARIES ${OPENCL_LIBRARY}) + set(HAVE_OPENCL 1) + else() + set(OPENCL_LIBRARIES) + endif() + else() + set(HAVE_OPENCL 1) + endif() endif() diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 4a56cff20..5e1288915 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -41,8 +41,8 @@ // //M*/ -#ifndef __OPENCV_GPU_HPP__ -#define __OPENCV_GPU_HPP__ +#ifndef __OPENCV_OCL_HPP__ +#define __OPENCV_OCL_HPP__ #include #include diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index 6e60da33e..18fed6e8c 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -47,6 +47,7 @@ //M*/ #include "precomp.hpp" +#include "mcwutil.hpp" #include using namespace std; using namespace cv; @@ -109,7 +110,7 @@ Ptr cv::ocl::createLinearFilter_GPU(int, int, const Mat &, con return Ptr(0); } -Ptr cv::ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType ) +Ptr cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType) { throw_nogpu(); return Ptr(0); @@ -150,17 +151,17 @@ void cv::ocl::Laplacian(const oclMat &, oclMat &, int, int, double) throw_nogpu(); } -void cv::ocl::erode( const oclMat &, oclMat &, const Mat &, Point, int) +void cv::ocl::erode(const oclMat &, oclMat &, const Mat &, Point, int) { throw_nogpu(); } -void cv::ocl::dilate( const oclMat &, oclMat &, const Mat &, Point, int) +void cv::ocl::dilate(const oclMat &, oclMat &, const Mat &, Point, int) { throw_nogpu(); } -void cv::ocl::morphologyEx( const oclMat &, oclMat &, int, const Mat &, Point, int) +void cv::ocl::morphologyEx(const oclMat &, oclMat &, int, const Mat &, Point, int) { throw_nogpu(); } @@ -170,100 +171,110 @@ void cv::ocl::morphologyEx( const oclMat &, oclMat &, int, const Mat &, Point, i //helper routines namespace cv { - namespace ocl - { - ///////////////////////////OpenCL kernel strings/////////////////////////// - extern const char *filtering_boxFilter; - extern const char *filter_sep_row; - extern const char *filter_sep_col; - extern const char *filtering_laplacian; - extern const char *filtering_morph; - } +namespace ocl +{ +///////////////////////////OpenCL kernel strings/////////////////////////// +extern const char *filtering_boxFilter; +extern const char *filter_sep_row; +extern const char *filter_sep_col; +extern const char *filtering_laplacian; +extern const char *filtering_morph; +} } namespace { - inline int divUp(int total, int grain) - { - return (total + grain - 1) / grain; - } +inline int divUp(int total, int grain) +{ + return (total + grain - 1) / grain; +} } namespace { - inline void normalizeAnchor(int &anchor, int ksize) +inline void normalizeAnchor(int &anchor, int ksize) +{ + if (anchor < 0) { - if (anchor < 0) - anchor = ksize >> 1; - - CV_Assert(0 <= anchor && anchor < ksize); + anchor = ksize >> 1; } - inline void normalizeAnchor(Point &anchor, const Size &ksize) + CV_Assert(0 <= anchor && anchor < ksize); +} + +inline void normalizeAnchor(Point &anchor, const Size &ksize) +{ + normalizeAnchor(anchor.x, ksize.width); + normalizeAnchor(anchor.y, ksize.height); +} + +inline void normalizeROI(Rect &roi, const Size &ksize, const Point &anchor, const Size &src_size) +{ + if (roi == Rect(0, 0, -1, -1)) { - normalizeAnchor(anchor.x, ksize.width); - normalizeAnchor(anchor.y, ksize.height); + roi = Rect(0, 0, src_size.width, src_size.height); } - inline void normalizeROI(Rect &roi, const Size &ksize, const Point &anchor, const Size &src_size) + CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1)); + CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1)); + CV_Assert(roi.x >= 0 && roi.y >= 0 && roi.width <= src_size.width && roi.height <= src_size.height); +} + + +inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8U, int *nDivisor = 0, bool reverse = false) +{ + int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1; + + if (nDivisor) { - if (roi == Rect(0, 0, -1, -1)) - roi = Rect(0, 0, src_size.width, src_size.height); - CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1)); - CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1)); - CV_Assert(roi.x >= 0 && roi.y >= 0 && roi.width <= src_size.width && roi.height <= src_size.height); + *nDivisor = scale; } + Mat temp(kernel.size(), type); + kernel.convertTo(temp, type, scale); + Mat cont_krnl = temp.reshape(1, 1); - inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8U, int *nDivisor = 0, bool reverse = false) + if (reverse) { - int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1; - if (nDivisor) *nDivisor = scale; + int count = cont_krnl.cols >> 1; - Mat temp(kernel.size(), type); - kernel.convertTo(temp, type, scale); - Mat cont_krnl = temp.reshape(1, 1); - - if (reverse) + for (int i = 0; i < count; ++i) { - int count = cont_krnl.cols >> 1; - for (int i = 0; i < count; ++i) - { - std::swap(cont_krnl.at(0, i), cont_krnl.at(0, cont_krnl.cols - 1 - i)); - } + std::swap(cont_krnl.at(0, i), cont_krnl.at(0, cont_krnl.cols - 1 - i)); } - - gpu_krnl.upload(cont_krnl); } + + gpu_krnl.upload(cont_krnl); +} } //////////////////////////////////////////////////////////////////////////////////////////////////// // Filter2D namespace { - class Filter2DEngine_GPU : public FilterEngine_GPU +class Filter2DEngine_GPU : public FilterEngine_GPU +{ +public: + Filter2DEngine_GPU(const Ptr &filter2D_) : filter2D(filter2D_) {} + + virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) { - public: - Filter2DEngine_GPU(const Ptr &filter2D_) : filter2D(filter2D_) {} + Size src_size = src.size(); - virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) - { - Size src_size = src.size(); + // Delete those two clause below which exist before, However, the result is alos correct + // dst.create(src_size, src.type()); + // dst = Scalar(0.0); - // Delete those two clause below which exist before, However, the result is alos correct - // dst.create(src_size, src.type()); - // dst = Scalar(0.0); + normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size); - normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size); + oclMat srcROI = src(roi); + oclMat dstROI = dst(roi); - oclMat srcROI = src(roi); - oclMat dstROI = dst(roi); + (*filter2D)(srcROI, dstROI); + } - (*filter2D)(srcROI, dstROI); - } - - Ptr filter2D; - }; + Ptr filter2D; +}; } Ptr cv::ocl::createFilter2D_GPU(const Ptr filter2D) @@ -275,22 +286,22 @@ Ptr cv::ocl::createFilter2D_GPU(const Ptr filt // Box Filter namespace { - typedef void (*FilterBox_t)(const oclMat & , oclMat & , Size &, const Point, const int); +typedef void (*FilterBox_t)(const oclMat & , oclMat & , Size &, const Point, const int); - class GPUBoxFilter : public BaseFilter_GPU +class GPUBoxFilter : public BaseFilter_GPU +{ +public: + GPUBoxFilter(const Size &ksize_, const Point &anchor_, const int borderType_, FilterBox_t func_) : + BaseFilter_GPU(ksize_, anchor_, borderType_), func(func_) {} + + virtual void operator()(const oclMat &src, oclMat &dst) { - public: - GPUBoxFilter(const Size &ksize_, const Point &anchor_, const int borderType_, FilterBox_t func_) : - BaseFilter_GPU(ksize_, anchor_, borderType_), func(func_) {} + func(src, dst, ksize, anchor, borderType); + } - virtual void operator()(const oclMat &src, oclMat &dst) - { - func(src, dst, ksize, anchor, borderType); - } + FilterBox_t func; - FilterBox_t func; - - }; +}; } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -298,22 +309,22 @@ namespace namespace { - typedef void (*GPUMorfFilter_t)(const oclMat & , oclMat & , oclMat & , Size &, const Point); +typedef void (*GPUMorfFilter_t)(const oclMat & , oclMat & , oclMat & , Size &, const Point); - class MorphFilter_GPU : public BaseFilter_GPU +class MorphFilter_GPU : public BaseFilter_GPU +{ +public: + MorphFilter_GPU(const Size &ksize_, const Point &anchor_, const oclMat &kernel_, GPUMorfFilter_t func_) : + BaseFilter_GPU(ksize_, anchor_, BORDER_CONSTANT), kernel(kernel_), func(func_) {} + + virtual void operator()(const oclMat &src, oclMat &dst) { - public: - MorphFilter_GPU(const Size &ksize_, const Point &anchor_, const oclMat &kernel_, GPUMorfFilter_t func_) : - BaseFilter_GPU(ksize_, anchor_, BORDER_CONSTANT), kernel(kernel_), func(func_) {} + func(src, dst, kernel, ksize, anchor) ; + } - virtual void operator()(const oclMat &src, oclMat &dst) - { - func(src, dst, kernel, ksize, anchor) ; - } - - oclMat kernel; - GPUMorfFilter_t func; - }; + oclMat kernel; + GPUMorfFilter_t func; +}; } /* @@ -326,9 +337,9 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c //Normalize the result by default //float alpha = ksize.height * ksize.width; CV_Assert(src.clCxt == dst.clCxt); - CV_Assert( (src.cols == dst.cols) && - (src.rows == dst.rows) ); - CV_Assert( (src.oclchannels() == dst.oclchannels()) ); + CV_Assert((src.cols == dst.cols) && + (src.rows == dst.rows)); + CV_Assert((src.oclchannels() == dst.oclchannels())); int srcStep = src.step1() / src.oclchannels(); int dstStep = dst.step1() / dst.oclchannels(); @@ -342,19 +353,21 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c size_t localThreads[3] = {16, 16, 1}; size_t globalThreads[3] = {(src.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0], (src.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1}; - if(src.type() == CV_8UC1) + if (src.type() == CV_8UC1) { kernelName = "morph_C1_D0"; globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - CV_Assert( localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1) ); + CV_Assert(localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1)); } else { kernelName = "morph"; - CV_Assert( localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1) ); + CV_Assert(localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1)); } + char s[64]; - switch(src.type()) + + switch (src.type()) { case CV_8UC1: sprintf(s, "-D VAL=255"); @@ -373,21 +386,22 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c default: CV_Error(CV_StsUnsupportedFormat, "unsupported type"); } + char compile_option[128]; sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s", anchor.x, anchor.y, localThreads[0], localThreads[1], s); vector< pair > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_x)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_y)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_x)); + args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_y)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows)); + args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dstOffset)); openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); } @@ -398,9 +412,9 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, //Normalize the result by default //float alpha = ksize.height * ksize.width; CV_Assert(src.clCxt == dst.clCxt); - CV_Assert( (src.cols == dst.cols) && - (src.rows == dst.rows) ); - CV_Assert( (src.oclchannels() == dst.oclchannels()) ); + CV_Assert((src.cols == dst.cols) && + (src.rows == dst.rows)); + CV_Assert((src.oclchannels() == dst.oclchannels())); int srcStep = src.step1() / src.oclchannels(); int dstStep = dst.step1() / dst.oclchannels(); @@ -414,19 +428,21 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, size_t localThreads[3] = {16, 16, 1}; size_t globalThreads[3] = {(src.cols + localThreads[0]) / localThreads[0] *localThreads[0], (src.rows + localThreads[1]) / localThreads[1] *localThreads[1], 1}; - if(src.type() == CV_8UC1) + if (src.type() == CV_8UC1) { kernelName = "morph_C1_D0"; globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0]) / localThreads[0] * localThreads[0]; - CV_Assert( localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1) ); + CV_Assert(localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1)); } else { kernelName = "morph"; - CV_Assert( localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1) ); + CV_Assert(localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1)); } + char s[64]; - switch(src.type()) + + switch (src.type()) { case CV_8UC1: sprintf(s, "-D VAL=0"); @@ -445,21 +461,22 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, default: CV_Error(CV_StsUnsupportedFormat, "unsupported type"); } + char compile_option[128]; sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s", anchor.x, anchor.y, localThreads[0], localThreads[1], s); vector< pair > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_x)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_y)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_x)); + args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_y)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows)); + args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dstOffset)); openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); } @@ -483,47 +500,48 @@ Ptr cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat namespace { - class MorphologyFilterEngine_GPU : public Filter2DEngine_GPU +class MorphologyFilterEngine_GPU : public Filter2DEngine_GPU +{ +public: + MorphologyFilterEngine_GPU(const Ptr &filter2D_, int iters_) : + Filter2DEngine_GPU(filter2D_), iters(iters_) {} + + virtual void apply(const oclMat &src, oclMat &dst) { - public: - MorphologyFilterEngine_GPU(const Ptr &filter2D_, int iters_) : - Filter2DEngine_GPU(filter2D_), iters(iters_) {} + Filter2DEngine_GPU::apply(src, dst); - virtual void apply(const oclMat &src, oclMat &dst) + //if (iters > 1) + //{ + // Size wholesize; + // Point ofs; + // dst.locateROI(wholesize,ofs); + // int rows = dst.rows, cols = dst.cols; + // dst.adjustROI(ofs.y,-ofs.y-rows+dst.wholerows,ofs.x,-ofs.x-cols+dst.wholecols); + // dst.copyTo(morfBuf); + // dst.adjustROI(-ofs.y,ofs.y+rows-dst.wholerows,-ofs.x,ofs.x+cols-dst.wholecols); + // morfBuf.adjustROI(-ofs.y,ofs.y+rows-dst.wholerows,-ofs.x,ofs.x+cols-dst.wholecols); + // //morfBuf.create(src.size(),src.type()); + // //Filter2DEngine_GPU::apply(dst, morfBuf); + // //morfBuf.copyTo(dst); + //} + for (int i = 1; i < iters; ++i) { - Filter2DEngine_GPU::apply(src, dst); - //if (iters > 1) - //{ - // Size wholesize; - // Point ofs; - // dst.locateROI(wholesize,ofs); - // int rows = dst.rows, cols = dst.cols; - // dst.adjustROI(ofs.y,-ofs.y-rows+dst.wholerows,ofs.x,-ofs.x-cols+dst.wholecols); - // dst.copyTo(morfBuf); - // dst.adjustROI(-ofs.y,ofs.y+rows-dst.wholerows,-ofs.x,ofs.x+cols-dst.wholecols); - // morfBuf.adjustROI(-ofs.y,ofs.y+rows-dst.wholerows,-ofs.x,ofs.x+cols-dst.wholecols); - // //morfBuf.create(src.size(),src.type()); - // //Filter2DEngine_GPU::apply(dst, morfBuf); - // //morfBuf.copyTo(dst); - //} - for(int i = 1; i < iters; ++i) - { - //dst.swap(morfBuf); - Size wholesize; - Point ofs; - dst.locateROI(wholesize, ofs); - int rows = dst.rows, cols = dst.cols; - dst.adjustROI(ofs.y, -ofs.y - rows + dst.wholerows, ofs.x, -ofs.x - cols + dst.wholecols); - dst.copyTo(morfBuf); - dst.adjustROI(-ofs.y, ofs.y + rows - dst.wholerows, -ofs.x, ofs.x + cols - dst.wholecols); - morfBuf.adjustROI(-ofs.y, ofs.y + rows - dst.wholerows, -ofs.x, ofs.x + cols - dst.wholecols); - Filter2DEngine_GPU::apply(morfBuf, dst); - } + //dst.swap(morfBuf); + Size wholesize; + Point ofs; + dst.locateROI(wholesize, ofs); + int rows = dst.rows, cols = dst.cols; + dst.adjustROI(ofs.y, -ofs.y - rows + dst.wholerows, ofs.x, -ofs.x - cols + dst.wholecols); + dst.copyTo(morfBuf); + dst.adjustROI(-ofs.y, ofs.y + rows - dst.wholerows, -ofs.x, ofs.x + cols - dst.wholecols); + morfBuf.adjustROI(-ofs.y, ofs.y + rows - dst.wholerows, -ofs.x, ofs.x + cols - dst.wholecols); + Filter2DEngine_GPU::apply(morfBuf, dst); } + } - int iters; - oclMat morfBuf; - }; + int iters; + oclMat morfBuf; +}; } Ptr cv::ocl::createMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Point &anchor, int iterations) @@ -539,104 +557,113 @@ Ptr cv::ocl::createMorphologyFilter_GPU(int op, int type, cons namespace { - void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point anchor, int iterations, int borderType, const Scalar &borderValue) +void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point anchor, int iterations, int borderType, const Scalar &borderValue) +{ + if ((borderType != cv::BORDER_CONSTANT) || (borderValue != morphologyDefaultBorderValue())) { - if((borderType != cv::BORDER_CONSTANT) || (borderValue != morphologyDefaultBorderValue())) - { - CV_Error(CV_StsBadArg, "unsupported border type"); - } - Mat kernel; - Size ksize = _kernel.data ? _kernel.size() : Size(3, 3); - - normalizeAnchor(anchor, ksize); - - if (iterations == 0 || _kernel.rows * _kernel.cols == 1) - { - src.copyTo(dst); - return; - } - - dst.create(src.size(), src.type()); - - if (!_kernel.data) - { - kernel = getStructuringElement(MORPH_RECT, Size(1 + iterations * 2, 1 + iterations * 2)); - anchor = Point(iterations, iterations); - iterations = 1; - } - else if (iterations > 1 && countNonZero(_kernel) == _kernel.rows * _kernel.cols) - { - anchor = Point(anchor.x * iterations, anchor.y * iterations); - kernel = getStructuringElement(MORPH_RECT, Size(ksize.width + iterations * (ksize.width - 1), - ksize.height + iterations * (ksize.height - 1)), anchor); - iterations = 1; - } - else - kernel = _kernel; - - Ptr f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations); - - f->apply(src, dst); + CV_Error(CV_StsBadArg, "unsupported border type"); } + + Mat kernel; + Size ksize = _kernel.data ? _kernel.size() : Size(3, 3); + + normalizeAnchor(anchor, ksize); + + if (iterations == 0 || _kernel.rows *_kernel.cols == 1) + { + src.copyTo(dst); + return; + } + + dst.create(src.size(), src.type()); + + if (!_kernel.data) + { + kernel = getStructuringElement(MORPH_RECT, Size(1 + iterations * 2, 1 + iterations * 2)); + anchor = Point(iterations, iterations); + iterations = 1; + } + else if (iterations > 1 && countNonZero(_kernel) == _kernel.rows * _kernel.cols) + { + anchor = Point(anchor.x * iterations, anchor.y * iterations); + kernel = getStructuringElement(MORPH_RECT, Size(ksize.width + iterations * (ksize.width - 1), + ksize.height + iterations * (ksize.height - 1)), anchor); + iterations = 1; + } + else + { + kernel = _kernel; + } + + Ptr f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations); + + f->apply(src, dst); +} } -void cv::ocl::erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations, - int borderType, const Scalar &borderValue) +void cv::ocl::erode(const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations, + int borderType, const Scalar &borderValue) { bool allZero = true; - for(int i = 0; i < kernel.rows * kernel.cols; ++i) - if(kernel.data[i] != 0) + + for (int i = 0; i < kernel.rows * kernel.cols; ++i) + if (kernel.data[i] != 0) + { allZero = false; - if(allZero) + } + + if (allZero) { kernel.data[0] = 1; } + morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations, borderType, borderValue); } -void cv::ocl::dilate( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations, - int borderType, const Scalar &borderValue) +void cv::ocl::dilate(const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations, + int borderType, const Scalar &borderValue) { morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations, borderType, borderValue); } -void cv::ocl::morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor, int iterations, - int borderType, const Scalar &borderValue) +void cv::ocl::morphologyEx(const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor, int iterations, + int borderType, const Scalar &borderValue) { oclMat temp; - switch( op ) + + switch (op) { case MORPH_ERODE: - erode( src, dst, kernel, anchor, iterations, borderType, borderValue); + erode(src, dst, kernel, anchor, iterations, borderType, borderValue); break; case MORPH_DILATE: - dilate( src, dst, kernel, anchor, iterations, borderType, borderValue); + dilate(src, dst, kernel, anchor, iterations, borderType, borderValue); break; case MORPH_OPEN: - erode( src, temp, kernel, anchor, iterations, borderType, borderValue); - dilate( temp, dst, kernel, anchor, iterations, borderType, borderValue); + erode(src, temp, kernel, anchor, iterations, borderType, borderValue); + dilate(temp, dst, kernel, anchor, iterations, borderType, borderValue); break; case CV_MOP_CLOSE: - dilate( src, temp, kernel, anchor, iterations, borderType, borderValue); - erode( temp, dst, kernel, anchor, iterations, borderType, borderValue); + dilate(src, temp, kernel, anchor, iterations, borderType, borderValue); + erode(temp, dst, kernel, anchor, iterations, borderType, borderValue); break; case CV_MOP_GRADIENT: - erode( src, temp, kernel, anchor, iterations, borderType, borderValue); - dilate( src, dst, kernel, anchor, iterations, borderType, borderValue); + erode(src, temp, kernel, anchor, iterations, borderType, borderValue); + dilate(src, dst, kernel, anchor, iterations, borderType, borderValue); subtract(dst, temp, dst); break; case CV_MOP_TOPHAT: - erode( src, dst, kernel, anchor, iterations, borderType, borderValue); - dilate( dst, temp, kernel, anchor, iterations, borderType, borderValue); + erode(src, dst, kernel, anchor, iterations, borderType, borderValue); + dilate(dst, temp, kernel, anchor, iterations, borderType, borderValue); subtract(src, temp, dst); break; case CV_MOP_BLACKHAT: - dilate( src, dst, kernel, anchor, iterations, borderType, borderValue); - erode( dst, temp, kernel, anchor, iterations, borderType, borderValue); + dilate(src, dst, kernel, anchor, iterations, borderType, borderValue); + erode(dst, temp, kernel, anchor, iterations, borderType, borderValue); subtract(temp, src, dst); break; default: - CV_Error( CV_StsBadArg, "unknown morphological operation" ); + CV_Error(CV_StsBadArg, "unknown morphological operation"); } } @@ -645,33 +672,33 @@ void cv::ocl::morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &k namespace { - typedef void (*GPUFilter2D_t)(const oclMat & , oclMat & , oclMat & , Size &, const Point, const int); +typedef void (*GPUFilter2D_t)(const oclMat & , oclMat & , oclMat & , Size &, const Point, const int); - class LinearFilter_GPU : public BaseFilter_GPU +class LinearFilter_GPU : public BaseFilter_GPU +{ +public: + LinearFilter_GPU(const Size &ksize_, const Point &anchor_, const oclMat &kernel_, GPUFilter2D_t func_, + int borderType_) : + BaseFilter_GPU(ksize_, anchor_, borderType_), kernel(kernel_), func(func_) {} + + virtual void operator()(const oclMat &src, oclMat &dst) { - public: - LinearFilter_GPU(const Size &ksize_, const Point &anchor_, const oclMat &kernel_, GPUFilter2D_t func_, - int borderType_) : - BaseFilter_GPU(ksize_, anchor_, borderType_), kernel(kernel_), func(func_) {} + func(src, dst, kernel, ksize, anchor, borderType) ; + } - virtual void operator()(const oclMat &src, oclMat &dst) - { - func(src, dst, kernel, ksize, anchor, borderType) ; - } - - oclMat kernel; - GPUFilter2D_t func; - }; + oclMat kernel; + GPUFilter2D_t func; +}; } void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, const Point anchor, const int borderType) { CV_Assert(src.clCxt == dst.clCxt); - CV_Assert( (src.cols == dst.cols) && - (src.rows == dst.rows) ); - CV_Assert( (src.oclchannels() == dst.oclchannels()) ); - CV_Assert( (borderType != 0) ); + CV_Assert((src.cols == dst.cols) && + (src.rows == dst.rows)); + CV_Assert((src.oclchannels() == dst.oclchannels())); + CV_Assert((borderType != 0)); CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1)); CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1)); Context *clCxt = src.clCxt; @@ -703,20 +730,20 @@ void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel, }; vector< pair > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.step)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset_x)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset_y)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset_x)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset_y)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.step)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_x)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_y)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dst.step)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_x)); + args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_y)); + args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows)); + args.push_back(make_pair(sizeof(cl_int), (void *)&cols)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols)); + args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows)); openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, cn, depth); } @@ -750,8 +777,10 @@ Ptr cv::ocl::createLinearFilter_GPU(int srcType, int dstType, void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor, int borderType) { - if( ddepth < 0 ) + if (ddepth < 0) + { ddepth = src.depth(); + } dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); @@ -764,50 +793,50 @@ void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &ke namespace { - class SeparableFilterEngine_GPU : public FilterEngine_GPU +class SeparableFilterEngine_GPU : public FilterEngine_GPU +{ +public: + SeparableFilterEngine_GPU(const Ptr &rowFilter_, + const Ptr &columnFilter_) : + rowFilter(rowFilter_), columnFilter(columnFilter_) { - public: - SeparableFilterEngine_GPU(const Ptr &rowFilter_, - const Ptr &columnFilter_) : - rowFilter(rowFilter_), columnFilter(columnFilter_) - { - ksize = Size(rowFilter->ksize, columnFilter->ksize); - anchor = Point(rowFilter->anchor, columnFilter->anchor); - } + ksize = Size(rowFilter->ksize, columnFilter->ksize); + anchor = Point(rowFilter->anchor, columnFilter->anchor); + } - virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) - { - Size src_size = src.size(); - //int src_type = src.type(); + virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) + { + Size src_size = src.size(); + //int src_type = src.type(); - int cn = src.oclchannels(); - //dst.create(src_size, src_type); - dst = Scalar(0.0); - //dstBuf.create(src_size, src_type); - dstBuf.create(src_size.height + ksize.height - 1, src_size.width, CV_MAKETYPE(CV_32F, cn)); - dstBuf = Scalar(0.0); + int cn = src.oclchannels(); + //dst.create(src_size, src_type); + //dst = Scalar(0.0); + //dstBuf.create(src_size, src_type); + dstBuf.create(src_size.height + ksize.height - 1, src_size.width, CV_MAKETYPE(CV_32F, cn)); + //dstBuf = Scalar(0.0); - normalizeROI(roi, ksize, anchor, src_size); + normalizeROI(roi, ksize, anchor, src_size); - srcROI = src(roi); - dstROI = dst(roi); - //dstBufROI = dstBuf(roi); + srcROI = src(roi); + dstROI = dst(roi); + //dstBufROI = dstBuf(roi); - (*rowFilter)(srcROI, dstBuf); - //Mat rm(dstBufROI); - //std::cout << "rm " << rm << endl; - (*columnFilter)(dstBuf, dstROI); - } + (*rowFilter)(srcROI, dstBuf); + //Mat rm(dstBufROI); + //std::cout << "rm " << rm << endl; + (*columnFilter)(dstBuf, dstROI); + } - Ptr rowFilter; - Ptr columnFilter; - Size ksize; - Point anchor; - oclMat dstBuf; - oclMat srcROI; - oclMat dstROI; - oclMat dstBufROI; - }; + Ptr rowFilter; + Ptr columnFilter; + Size ksize; + Point anchor; + oclMat dstBuf; + oclMat srcROI; + oclMat dstROI; + oclMat dstBufROI; +}; } Ptr cv::ocl::createSeparableFilter_GPU(const Ptr &rowFilter, @@ -829,13 +858,14 @@ void GPUFilterBox_8u_C1R(const oclMat &src, oclMat &dst, CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && - (src.rows == dst.rows) ); + (src.rows == dst.rows)); Context *clCxt = src.clCxt; string kernelName = "boxFilter_C1_D0"; char btype[30]; - switch(borderType) + + switch (borderType) { case 0: sprintf(btype, "BORDER_CONSTANT"); @@ -890,13 +920,14 @@ void GPUFilterBox_8u_C4R(const oclMat &src, oclMat &dst, CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && - (src.rows == dst.rows) ); + (src.rows == dst.rows)); Context *clCxt = src.clCxt; string kernelName = "boxFilter_C4_D0"; char btype[30]; - switch(borderType) + + switch (borderType) { case 0: sprintf(btype, "BORDER_CONSTANT"); @@ -951,13 +982,14 @@ void GPUFilterBox_32F_C1R(const oclMat &src, oclMat &dst, CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && - (src.rows == dst.rows) ); + (src.rows == dst.rows)); Context *clCxt = src.clCxt; string kernelName = "boxFilter_C1_D5"; char btype[30]; - switch(borderType) + + switch (borderType) { case 0: sprintf(btype, "BORDER_CONSTANT"); @@ -1013,13 +1045,14 @@ void GPUFilterBox_32F_C4R(const oclMat &src, oclMat &dst, CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && - (src.rows == dst.rows) ); + (src.rows == dst.rows)); Context *clCxt = src.clCxt; string kernelName = "boxFilter_C4_D5"; char btype[30]; - switch(borderType) + + switch (borderType) { case 0: sprintf(btype, "BORDER_CONSTANT"); @@ -1095,8 +1128,11 @@ void cv::ocl::boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize, Point anchor, int borderType) { int sdepth = src.depth(), cn = src.channels(); - if( ddepth < 0 ) + + if (ddepth < 0) + { ddepth = sdepth; + } dst.create(src.size(), CV_MAKETYPE(ddepth, cn)); @@ -1107,22 +1143,22 @@ void cv::ocl::boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize, namespace { - typedef void (*gpuFilter1D_t)(const oclMat &src, const oclMat &dst, oclMat kernel, int ksize, int anchor, int bordertype); +typedef void (*gpuFilter1D_t)(const oclMat &src, const oclMat &dst, oclMat kernel, int ksize, int anchor, int bordertype); - class GpuLinearRowFilter : public BaseRowFilter_GPU +class GpuLinearRowFilter : public BaseRowFilter_GPU +{ +public: + GpuLinearRowFilter(int ksize_, int anchor_, const oclMat &kernel_, gpuFilter1D_t func_, int bordertype_) : + BaseRowFilter_GPU(ksize_, anchor_, bordertype_), kernel(kernel_), func(func_) {} + + virtual void operator()(const oclMat &src, oclMat &dst) { - public: - GpuLinearRowFilter(int ksize_, int anchor_, const oclMat &kernel_, gpuFilter1D_t func_, int bordertype_) : - BaseRowFilter_GPU(ksize_, anchor_, bordertype_), kernel(kernel_), func(func_) {} + func(src, dst, kernel, ksize, anchor, bordertype); + } - virtual void operator()(const oclMat &src, oclMat &dst) - { - func(src, dst, kernel, ksize, anchor, bordertype); - } - - oclMat kernel; - gpuFilter1D_t func; - }; + oclMat kernel; + gpuFilter1D_t func; +}; } template struct index_and_sizeof; @@ -1161,7 +1197,8 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel string kernelName = "row_filter"; char btype[30]; - switch(bordertype) + + switch (bordertype) { case 0: sprintf(btype, "BORDER_CONSTANT"); @@ -1179,15 +1216,17 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel sprintf(btype, "BORDER_REFLECT_101"); break; } + char compile_option[128]; sprintf(compile_option, "-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s", anchor, localThreads[0], localThreads[1], channels, btype); size_t globalThreads[3]; globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; globalThreads[2] = (1 + localThreads[2] - 1) / localThreads[2] * localThreads[2]; - if(src.depth() == CV_8U) + + if (src.depth() == CV_8U) { - switch(channels) + switch (channels) { case 1: case 3: @@ -1205,6 +1244,7 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel { globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; } + //sanity checks CV_Assert(clCxt == dst.clCxt); CV_Assert(src.cols == dst.cols); @@ -1232,7 +1272,7 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel args.push_back(make_pair(sizeof(cl_int), (void *)&ridusy)); args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); - openCLExecuteKernel(clCxt, &filter_sep_row, kernelName, globalThreads, localThreads, args, channels, src.depth(), compile_option); + openCLExecuteKernel2(clCxt, &filter_sep_row, kernelName, globalThreads, localThreads, args, channels, src.depth(), compile_option, CLFLUSH); } Ptr cv::ocl::getLinearRowFilter_GPU(int srcType, int /*bufType*/, const Mat &rowKernel, int anchor, int bordertype) @@ -1263,20 +1303,20 @@ Ptr cv::ocl::getLinearRowFilter_GPU(int srcType, int /*bufTyp namespace { - class GpuLinearColumnFilter : public BaseColumnFilter_GPU +class GpuLinearColumnFilter : public BaseColumnFilter_GPU +{ +public: + GpuLinearColumnFilter(int ksize_, int anchor_, const oclMat &kernel_, gpuFilter1D_t func_, int bordertype_) : + BaseColumnFilter_GPU(ksize_, anchor_, bordertype_), kernel(kernel_), func(func_) {} + + virtual void operator()(const oclMat &src, oclMat &dst) { - public: - GpuLinearColumnFilter(int ksize_, int anchor_, const oclMat &kernel_, gpuFilter1D_t func_, int bordertype_) : - BaseColumnFilter_GPU(ksize_, anchor_, bordertype_), kernel(kernel_), func(func_) {} + func(src, dst, kernel, ksize, anchor, bordertype); + } - virtual void operator()(const oclMat &src, oclMat &dst) - { - func(src, dst, kernel, ksize, anchor, bordertype); - } - - oclMat kernel; - gpuFilter1D_t func; - }; + oclMat kernel; + gpuFilter1D_t func; +}; } template @@ -1289,7 +1329,8 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker string kernelName = "col_filter"; char btype[30]; - switch(bordertype) + + switch (bordertype) { case 0: sprintf(btype, "BORDER_CONSTANT"); @@ -1307,15 +1348,17 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker sprintf(btype, "BORDER_REFLECT_101"); break; } + char compile_option[256]; size_t globalThreads[3]; globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; globalThreads[2] = (1 + localThreads[2] - 1) / localThreads[2] * localThreads[2]; - if(dst.depth() == CV_8U) + + if (dst.depth() == CV_8U) { - switch(channels) + switch (channels) { case 1: globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; @@ -1338,7 +1381,8 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker else { globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - switch(dst.type()) + + switch (dst.type()) { case CV_32SC1: sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", @@ -1441,20 +1485,25 @@ Ptr cv::ocl::createSeparableLinearFilter_GPU(int srcType, int void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor, double delta, int bordertype) { - if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi + if ((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi { - if((bordertype & cv::BORDER_ISOLATED) != 0) + if ((bordertype & cv::BORDER_ISOLATED) != 0) { bordertype &= ~cv::BORDER_ISOLATED; - if((bordertype != cv::BORDER_CONSTANT) && + + if ((bordertype != cv::BORDER_CONSTANT) && (bordertype != cv::BORDER_REPLICATE)) { CV_Error(CV_StsBadArg, "unsupported border type"); } } } - if( ddepth < 0 ) + + if (ddepth < 0) + { ddepth = src.depth(); + } + //CV_Assert(ddepth == src.depth()); dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); @@ -1462,12 +1511,12 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat f->apply(src, dst); } -Ptr cv::ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType ) +Ptr cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType) { Mat kx, ky; - getDerivKernels( kx, ky, dx, dy, ksize, false, CV_32F ); + getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); return createSeparableLinearFilter_GPU(srcType, dstType, - kx, ky, Point(-1, -1), 0, borderType ); + kx, ky, Point(-1, -1), 0, borderType); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1482,15 +1531,20 @@ void cv::ocl::Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, // usually the smoothing part is the slowest to compute, // so try to scale it instead of the faster differenciating part if (dx == 0) + { kx *= scale; + } else + { ky *= scale; + } } + // Mat kx_, ky_; //ky.convertTo(ky_,CV_32S,1<<8); //kx.convertTo(kx_,CV_32S,1<<8); - sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, borderType ); + sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, borderType); } void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, double scale, double delta , int bordertype) @@ -1498,14 +1552,18 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, Mat kx, ky; getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F); - if( scale != 1 ) + if (scale != 1) { // usually the smoothing part is the slowest to compute, // so try to scale it instead of the faster differenciating part - if( dx == 0 ) + if (dx == 0) + { kx *= scale; + } else + { ky *= scale; + } } // Mat kx_, ky_; @@ -1517,7 +1575,7 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale) { - if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F) + if (src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F) { CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n"); return; @@ -1531,8 +1589,12 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d {2, 0, 2, 0, -8, 0, 2, 0, 2} }; Mat kernel(3, 3, CV_32S, (void *)K[ksize == 3]); + if (scale != 1) + { kernel *= scale; + } + filter2D(src, dst, ddepth, kernel, Point(-1, -1)); } @@ -1544,25 +1606,38 @@ Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do int depth = CV_MAT_DEPTH(type); if (sigma2 <= 0) + { sigma2 = sigma1; + } // automatic detection of kernel size from sigma if (ksize.width <= 0 && sigma1 > 0) + { ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1; - if (ksize.height <= 0 && sigma2 > 0) - ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1; + } - CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 ); + if (ksize.height <= 0 && sigma2 > 0) + { + ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1; + } + + CV_Assert(ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1); sigma1 = std::max(sigma1, 0.0); sigma2 = std::max(sigma2, 0.0); - Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) ); + Mat kx = getGaussianKernel(ksize.width, sigma1, std::max(depth, CV_32F)); Mat ky; - if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON ) + + if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON) + { ky = kx; + } else - ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) ); + { + ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F)); + } + //Mat kx_, ky_; //kx.convertTo(kx_,CV_32S,1<<8); //ky.convertTo(ky_,CV_32S,1<<8); @@ -1576,26 +1651,36 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si src.copyTo(dst); return; } - if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi + + if ((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi { - if((bordertype & cv::BORDER_ISOLATED) != 0) + if ((bordertype & cv::BORDER_ISOLATED) != 0) { bordertype &= ~cv::BORDER_ISOLATED; - if((bordertype != cv::BORDER_CONSTANT) && + + if ((bordertype != cv::BORDER_CONSTANT) && (bordertype != cv::BORDER_REPLICATE)) { CV_Error(CV_StsBadArg, "unsupported border type"); } } } + dst.create(src.size(), src.type()); - if( bordertype != BORDER_CONSTANT ) + + if (bordertype != BORDER_CONSTANT) { - if( src.rows == 1 ) + if (src.rows == 1) + { ksize.height = 1; - if( src.cols == 1 ) + } + + if (src.cols == 1) + { ksize.width = 1; + } } + Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype); f->apply(src, dst); } diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index 07489157e..492cd9197 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -63,13 +63,13 @@ using namespace std; namespace cv { - namespace ocl - { - ///////////////////////////OpenCL kernel strings/////////////////////////// - extern const char *haarobjectdetect; - extern const char *haarobjectdetectbackup; - extern const char *haarobjectdetect_scaled2; - } +namespace ocl +{ +///////////////////////////OpenCL kernel strings/////////////////////////// +extern const char *haarobjectdetect; +extern const char *haarobjectdetectbackup; +extern const char *haarobjectdetect_scaled2; +} } /* these settings affect the quality of detection: change with care */ @@ -883,13 +883,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS bool findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0; // bool roughSearch = (flags & CV_HAAR_DO_ROUGH_SEARCH) != 0; - //the Intel HD Graphics is unsupported - if (gimg.clCxt->impl->devName.find("Intel(R) HD Graphics") != string::npos) - { - cout << " Intel HD GPU device unsupported " << endl; - return NULL; - } - //double t = 0; if( maxSize.height == 0 || maxSize.width == 0 ) { @@ -937,7 +930,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS if( gimg.cols < minSize.width || gimg.rows < minSize.height ) CV_Error(CV_StsError, "Image too small"); - if( flags & CV_HAAR_SCALE_IMAGE ) + if( (flags & CV_HAAR_SCALE_IMAGE) && gimg.clCxt->impl->devName.find("Intel(R) HD Graphics") == string::npos ) { CvSize winSize0 = cascade->orig_window_size; //float scalefactor = 1.1f; @@ -2170,41 +2163,41 @@ CvType haar_type( CV_TYPE_NAME_HAAR, gpuIsHaarClassifier, namespace cv { - HaarClassifierCascade::HaarClassifierCascade() {} - HaarClassifierCascade::HaarClassifierCascade(const String &filename) - { - load(filename); - } +HaarClassifierCascade::HaarClassifierCascade() {} +HaarClassifierCascade::HaarClassifierCascade(const String &filename) +{ + load(filename); +} - bool HaarClassifierCascade::load(const String &filename) - { - cascade = Ptr((CvHaarClassifierCascade *)cvLoad(filename.c_str(), 0, 0, 0)); - return (CvHaarClassifierCascade *)cascade != 0; - } +bool HaarClassifierCascade::load(const String &filename) +{ + cascade = Ptr((CvHaarClassifierCascade *)cvLoad(filename.c_str(), 0, 0, 0)); + return (CvHaarClassifierCascade *)cascade != 0; +} - void HaarClassifierCascade::detectMultiScale( const Mat &image, - Vector &objects, double scaleFactor, - int minNeighbors, int flags, - Size minSize ) - { - MemStorage storage(cvCreateMemStorage(0)); - CvMat _image = image; - CvSeq *_objects = gpuHaarDetectObjects( &_image, cascade, storage, scaleFactor, - minNeighbors, flags, minSize ); - Seq(_objects).copyTo(objects); - } +void HaarClassifierCascade::detectMultiScale( const Mat &image, + Vector &objects, double scaleFactor, + int minNeighbors, int flags, + Size minSize ) +{ + MemStorage storage(cvCreateMemStorage(0)); + CvMat _image = image; + CvSeq *_objects = gpuHaarDetectObjects( &_image, cascade, storage, scaleFactor, + minNeighbors, flags, minSize ); + Seq(_objects).copyTo(objects); +} - int HaarClassifierCascade::runAt(Point pt, int startStage, int) const - { - return gpuRunHaarClassifierCascade(cascade, pt, startStage); - } +int HaarClassifierCascade::runAt(Point pt, int startStage, int) const +{ + return gpuRunHaarClassifierCascade(cascade, pt, startStage); +} - void HaarClassifierCascade::setImages( const Mat &sum, const Mat &sqsum, - const Mat &tilted, double scale ) - { - CvMat _sum = sum, _sqsum = sqsum, _tilted = tilted; - gpuSetImagesForHaarClassifierCascade( cascade, &_sum, &_sqsum, &_tilted, scale ); - } +void HaarClassifierCascade::setImages( const Mat &sum, const Mat &sqsum, + const Mat &tilted, double scale ) +{ + CvMat _sum = sum, _sqsum = sqsum, _tilted = tilted; + gpuSetImagesForHaarClassifierCascade( cascade, &_sum, &_sqsum, &_tilted, scale ); +} } #endif @@ -2579,116 +2572,116 @@ CvPoint pt, int start_stage */) namespace cv { - namespace ocl +namespace ocl +{ + +struct gpuHaarDetectObjects_ScaleImage_Invoker +{ + gpuHaarDetectObjects_ScaleImage_Invoker( const CvHaarClassifierCascade *_cascade, + int _stripSize, double _factor, + const Mat &_sum1, const Mat &_sqsum1, Mat *_norm1, + Mat *_mask1, Rect _equRect, ConcurrentRectVector &_vec ) { + cascade = _cascade; + stripSize = _stripSize; + factor = _factor; + sum1 = _sum1; + sqsum1 = _sqsum1; + norm1 = _norm1; + mask1 = _mask1; + equRect = _equRect; + vec = &_vec; + } - struct gpuHaarDetectObjects_ScaleImage_Invoker + void operator()( const BlockedRange &range ) const + { + Size winSize0 = cascade->orig_window_size; + Size winSize(cvRound(winSize0.width * factor), cvRound(winSize0.height * factor)); + int y1 = range.begin() * stripSize, y2 = min(range.end() * stripSize, sum1.rows - 1 - winSize0.height); + Size ssz(sum1.cols - 1 - winSize0.width, y2 - y1); + int x, y, ystep = factor > 2 ? 1 : 2; + + for( y = y1; y < y2; y += ystep ) + for( x = 0; x < ssz.width; x += ystep ) + { + if( gpuRunHaarClassifierCascade( /*cascade, cvPoint(x, y), 0*/ ) > 0 ) + vec->push_back(Rect(cvRound(x * factor), cvRound(y * factor), + winSize.width, winSize.height)); + } + } + + const CvHaarClassifierCascade *cascade; + int stripSize; + double factor; + Mat sum1, sqsum1, *norm1, *mask1; + Rect equRect; + ConcurrentRectVector *vec; +}; + + +struct gpuHaarDetectObjects_ScaleCascade_Invoker +{ + gpuHaarDetectObjects_ScaleCascade_Invoker( const CvHaarClassifierCascade *_cascade, + Size _winsize, const Range &_xrange, double _ystep, + size_t _sumstep, const int **_p, const int **_pq, + ConcurrentRectVector &_vec ) + { + cascade = _cascade; + winsize = _winsize; + xrange = _xrange; + ystep = _ystep; + sumstep = _sumstep; + p = _p; + pq = _pq; + vec = &_vec; + } + + void operator()( const BlockedRange &range ) const + { + int iy, startY = range.begin(), endY = range.end(); + const int *p0 = p[0], *p1 = p[1], *p2 = p[2], *p3 = p[3]; + const int *pq0 = pq[0], *pq1 = pq[1], *pq2 = pq[2], *pq3 = pq[3]; + bool doCannyPruning = p0 != 0; + int sstep = (int)(sumstep / sizeof(p0[0])); + + for( iy = startY; iy < endY; iy++ ) { - gpuHaarDetectObjects_ScaleImage_Invoker( const CvHaarClassifierCascade *_cascade, - int _stripSize, double _factor, - const Mat &_sum1, const Mat &_sqsum1, Mat *_norm1, - Mat *_mask1, Rect _equRect, ConcurrentRectVector &_vec ) + int ix, y = cvRound(iy * ystep), ixstep = 1; + for( ix = xrange.start; ix < xrange.end; ix += ixstep ) { - cascade = _cascade; - stripSize = _stripSize; - factor = _factor; - sum1 = _sum1; - sqsum1 = _sqsum1; - norm1 = _norm1; - mask1 = _mask1; - equRect = _equRect; - vec = &_vec; - } + int x = cvRound(ix * ystep); // it should really be ystep, not ixstep - void operator()( const BlockedRange &range ) const - { - Size winSize0 = cascade->orig_window_size; - Size winSize(cvRound(winSize0.width * factor), cvRound(winSize0.height * factor)); - int y1 = range.begin() * stripSize, y2 = min(range.end() * stripSize, sum1.rows - 1 - winSize0.height); - Size ssz(sum1.cols - 1 - winSize0.width, y2 - y1); - int x, y, ystep = factor > 2 ? 1 : 2; - - for( y = y1; y < y2; y += ystep ) - for( x = 0; x < ssz.width; x += ystep ) - { - if( gpuRunHaarClassifierCascade( /*cascade, cvPoint(x, y), 0*/ ) > 0 ) - vec->push_back(Rect(cvRound(x * factor), cvRound(y * factor), - winSize.width, winSize.height)); - } - } - - const CvHaarClassifierCascade *cascade; - int stripSize; - double factor; - Mat sum1, sqsum1, *norm1, *mask1; - Rect equRect; - ConcurrentRectVector *vec; - }; - - - struct gpuHaarDetectObjects_ScaleCascade_Invoker - { - gpuHaarDetectObjects_ScaleCascade_Invoker( const CvHaarClassifierCascade *_cascade, - Size _winsize, const Range &_xrange, double _ystep, - size_t _sumstep, const int **_p, const int **_pq, - ConcurrentRectVector &_vec ) - { - cascade = _cascade; - winsize = _winsize; - xrange = _xrange; - ystep = _ystep; - sumstep = _sumstep; - p = _p; - pq = _pq; - vec = &_vec; - } - - void operator()( const BlockedRange &range ) const - { - int iy, startY = range.begin(), endY = range.end(); - const int *p0 = p[0], *p1 = p[1], *p2 = p[2], *p3 = p[3]; - const int *pq0 = pq[0], *pq1 = pq[1], *pq2 = pq[2], *pq3 = pq[3]; - bool doCannyPruning = p0 != 0; - int sstep = (int)(sumstep / sizeof(p0[0])); - - for( iy = startY; iy < endY; iy++ ) + if( doCannyPruning ) { - int ix, y = cvRound(iy * ystep), ixstep = 1; - for( ix = xrange.start; ix < xrange.end; ix += ixstep ) + int offset = y * sstep + x; + int s = p0[offset] - p1[offset] - p2[offset] + p3[offset]; + int sq = pq0[offset] - pq1[offset] - pq2[offset] + pq3[offset]; + if( s < 100 || sq < 20 ) { - int x = cvRound(ix * ystep); // it should really be ystep, not ixstep - - if( doCannyPruning ) - { - int offset = y * sstep + x; - int s = p0[offset] - p1[offset] - p2[offset] + p3[offset]; - int sq = pq0[offset] - pq1[offset] - pq2[offset] + pq3[offset]; - if( s < 100 || sq < 20 ) - { - ixstep = 2; - continue; - } - } - - int result = gpuRunHaarClassifierCascade(/* cascade, cvPoint(x, y), 0 */); - if( result > 0 ) - vec->push_back(Rect(x, y, winsize.width, winsize.height)); - ixstep = result != 0 ? 1 : 2; + ixstep = 2; + continue; } } + + int result = gpuRunHaarClassifierCascade(/* cascade, cvPoint(x, y), 0 */); + if( result > 0 ) + vec->push_back(Rect(x, y, winsize.width, winsize.height)); + ixstep = result != 0 ? 1 : 2; } - - const CvHaarClassifierCascade *cascade; - double ystep; - size_t sumstep; - Size winsize; - Range xrange; - const int **p; - const int **pq; - ConcurrentRectVector *vec; - }; - + } } + + const CvHaarClassifierCascade *cascade; + double ystep; + size_t sumstep; + Size winsize; + Range xrange; + const int **p; + const int **pq; + ConcurrentRectVector *vec; +}; + +} } /* diff --git a/modules/ocl/src/kernels/arithm_mul.cl b/modules/ocl/src/kernels/arithm_mul.cl index e0cfbd80f..f9f3936a4 100644 --- a/modules/ocl/src/kernels/arithm_mul.cl +++ b/modules/ocl/src/kernels/arithm_mul.cl @@ -16,6 +16,7 @@ // // @Authors // Jia Haipeng, jiahaipeng95@gmail.com +// Dachuan Zhao, dachuan@multicorewareinc.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -260,3 +261,22 @@ __kernel void arithm_mul_D6 (__global double *src1, int src1_step, int src1_offs } } #endif + +__kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset, + __global float *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1, float scalar) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < cols && y < rows) + { + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); + int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); + + float data1 = *((__global float *)((__global char *)src1 + src1_index)); + float tmp = data1 * scalar; + + *((__global float *)((__global char *)dst + dst_index)) = tmp; + } +} \ No newline at end of file diff --git a/modules/ocl/src/kernels/haarobjectdetect_scaled2.cl b/modules/ocl/src/kernels/haarobjectdetect_scaled2.cl index 14b68ea7a..22d3004e2 100644 --- a/modules/ocl/src/kernels/haarobjectdetect_scaled2.cl +++ b/modules/ocl/src/kernels/haarobjectdetect_scaled2.cl @@ -44,75 +44,75 @@ //M*/ // Enter your kernel in this window -#pragma OPENCL EXTENSION cl_amd_printf:enable +//#pragma OPENCL EXTENSION cl_amd_printf:enable #define CV_HAAR_FEATURE_MAX 3 typedef int sumtype; typedef float sqsumtype; -typedef struct __attribute__((aligned (128))) GpuHidHaarFeature +typedef struct __attribute__((aligned(128))) GpuHidHaarFeature { - struct __attribute__((aligned (32))) - { - int p0 __attribute__((aligned (4))); - int p1 __attribute__((aligned (4))); - int p2 __attribute__((aligned (4))); - int p3 __attribute__((aligned (4))); - float weight __attribute__((aligned (4))); - } - rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32))); + struct __attribute__((aligned(32))) +{ + int p0 __attribute__((aligned(4))); + int p1 __attribute__((aligned(4))); + int p2 __attribute__((aligned(4))); + int p3 __attribute__((aligned(4))); + float weight __attribute__((aligned(4))); +} +rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned(32))); } GpuHidHaarFeature; -typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode +typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode { - int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64))); + int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64))); float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/; float threshold /*__attribute__((aligned (4)))*/; - float alpha[2] __attribute__((aligned (8))); - int left __attribute__((aligned (4))); - int right __attribute__((aligned (4))); + float alpha[2] __attribute__((aligned(8))); + int left __attribute__((aligned(4))); + int right __attribute__((aligned(4))); } GpuHidHaarTreeNode; -typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier +typedef struct __attribute__((aligned(32))) GpuHidHaarClassifier { - int count __attribute__((aligned (4))); - GpuHidHaarTreeNode* node __attribute__((aligned (8))); - float* alpha __attribute__((aligned (8))); + int count __attribute__((aligned(4))); + GpuHidHaarTreeNode *node __attribute__((aligned(8))); + float *alpha __attribute__((aligned(8))); } GpuHidHaarClassifier; -typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier +typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier { - int count __attribute__((aligned (4))); - float threshold __attribute__((aligned (4))); - int two_rects __attribute__((aligned (4))); - int reserved0 __attribute__((aligned (8))); - int reserved1 __attribute__((aligned (8))); - int reserved2 __attribute__((aligned (8))); - int reserved3 __attribute__((aligned (8))); + int count __attribute__((aligned(4))); + float threshold __attribute__((aligned(4))); + int two_rects __attribute__((aligned(4))); + int reserved0 __attribute__((aligned(8))); + int reserved1 __attribute__((aligned(8))); + int reserved2 __attribute__((aligned(8))); + int reserved3 __attribute__((aligned(8))); } GpuHidHaarStageClassifier; -typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade +typedef struct __attribute__((aligned(64))) GpuHidHaarClassifierCascade { - int count __attribute__((aligned (4))); - int is_stump_based __attribute__((aligned (4))); - int has_tilted_features __attribute__((aligned (4))); - int is_tree __attribute__((aligned (4))); - int pq0 __attribute__((aligned (4))); - int pq1 __attribute__((aligned (4))); - int pq2 __attribute__((aligned (4))); - int pq3 __attribute__((aligned (4))); - int p0 __attribute__((aligned (4))); - int p1 __attribute__((aligned (4))); - int p2 __attribute__((aligned (4))); - int p3 __attribute__((aligned (4))); - float inv_window_area __attribute__((aligned (4))); -}GpuHidHaarClassifierCascade; + int count __attribute__((aligned(4))); + int is_stump_based __attribute__((aligned(4))); + int has_tilted_features __attribute__((aligned(4))); + int is_tree __attribute__((aligned(4))); + int pq0 __attribute__((aligned(4))); + int pq1 __attribute__((aligned(4))); + int pq2 __attribute__((aligned(4))); + int pq3 __attribute__((aligned(4))); + int p0 __attribute__((aligned(4))); + int p1 __attribute__((aligned(4))); + int p2 __attribute__((aligned(4))); + int p3 __attribute__((aligned(4))); + float inv_window_area __attribute__((aligned(4))); +} GpuHidHaarClassifierCascade; __kernel void gpuRunHaarClassifierCascade_scaled2( - global GpuHidHaarStageClassifier * stagecascadeptr, - global int4 * info, - global GpuHidHaarTreeNode * nodeptr, - global const int * restrict sum, - global const float * restrict sqsum, - global int4 * candidate, + global GpuHidHaarStageClassifier *stagecascadeptr, + global int4 *info, + global GpuHidHaarTreeNode *nodeptr, + global const int *restrict sum, + global const float *restrict sqsum, + global int4 *candidate, const int step, const int loopcount, const int start_stage, @@ -120,215 +120,167 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( const int end_stage, const int startnode, const int splitnode, - global int4 * p, - //const int4 * pq, - global float * correction, - const int nodecount) + global int4 *p, + //const int4 * pq, + global float *correction, + const int nodecount) { - int grpszx = get_local_size(0); - int grpszy = get_local_size(1); - int grpnumx = get_num_groups(0); - int grpidx=get_group_id(0); - int lclidx = get_local_id(0); - int lclidy = get_local_id(1); - int lcl_sz = mul24(grpszx,grpszy); - int lcl_id = mad24(lclidy,grpszx,lclidx); - __local int lclshare[1024]; - __local int* glboutindex=lclshare+0; - __local int* lclcount=glboutindex+1; - __local int* lcloutindex=lclcount+1; - __local float* partialsum=(__local float*)(lcloutindex+(lcl_sz<<1)); - glboutindex[0]=0; - int outputoff = mul24(grpidx,256); - candidate[outputoff+(lcl_id<<2)] = (int4)0; - candidate[outputoff+(lcl_id<<2)+1] = (int4)0; - candidate[outputoff+(lcl_id<<2)+2] = (int4)0; - candidate[outputoff+(lcl_id<<2)+3] = (int4)0; - for(int scalei = 0; scalei > 16; - int height = scaleinfo1.x & 0xffff; - int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16; - int totalgrp = scaleinfo1.y & 0xffff; - float factor = as_float(scaleinfo1.w); - float correction_t=correction[scalei]; - int ystep=(int)(max(2.0f,factor)+0.5f); - for(int grploop=get_group_id(0);grploop=0.f ? sqrt(variance_norm_factor) : 1.f; - result = 1; - nodecounter = startnode+nodecount*scalei; - for(int stageloop = start_stage; stageloop < split_stage&&result; stageloop++ ) - { - float stage_sum = 0.f; - int4 stageinfo = *(global int4*)(stagecascadeptr+stageloop); - float stagethreshold = as_float(stageinfo.y); - for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ ) - { - __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter); - int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0])); - int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); - int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); - float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); - float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0])); - float nodethreshold = w.w * variance_norm_factor; - info1.x +=p_offset; - info1.z +=p_offset; - info2.x +=p_offset; - info2.z +=p_offset; - float classsum = (sum[mad24(info1.y,step,info1.x)] - sum[mad24(info1.y,step,info1.z)] - - sum[mad24(info1.w,step,info1.x)] + sum[mad24(info1.w,step,info1.z)]) * w.x; - classsum += (sum[mad24(info2.y,step,info2.x)] - sum[mad24(info2.y,step,info2.z)] - - sum[mad24(info2.w,step,info2.x)] + sum[mad24(info2.w,step,info2.z)]) * w.y; - info3.x +=p_offset; - info3.z +=p_offset; - classsum += (sum[mad24(info3.y,step,info3.x)] - sum[mad24(info3.y,step,info3.z)] - - sum[mad24(info3.w,step,info3.x)] + sum[mad24(info3.w,step,info3.z)]) * w.z; - stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x; - nodecounter++; - } - result=(stage_sum>=stagethreshold); - } - if(result&&(ix0;stageloop++) - { - lclcount[0]=0; - barrier(CLK_LOCAL_MEM_FENCE); - int2 stageinfo=*(global int2*)(stagecascadeptr+stageloop); - float stagethreshold=as_float(stageinfo.y); - int perfscale=queuecount>4?3:2; - int queuecount_loop=(queuecount+(1<>perfscale; - int lcl_compute_win=lcl_sz>>perfscale; - int lcl_compute_win_id=(lcl_id>>(6-perfscale)); - int lcl_loops=(stageinfo.x+lcl_compute_win-1)>>(6-perfscale); - int lcl_compute_id=lcl_id-(lcl_compute_win_id<<(6-perfscale)); - for(int queueloop=0;queueloop>16),step,temp_coord&0xffff); - int tempnodecounter=lcl_compute_id; - float part_sum=0.f; - for(int lcl_loop=0;lcl_loopp[0][0])); - int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); - int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); - float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); - float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0])); - float nodethreshold = w.w * variance_norm_factor; - info1.x +=queue_offset; - info1.z +=queue_offset; - info2.x +=queue_offset; - info2.z +=queue_offset; - float classsum = (sum[mad24(info1.y,step,info1.x)] - sum[mad24(info1.y,step,info1.z)] - - sum[mad24(info1.w,step,info1.x)] + sum[mad24(info1.w,step,info1.z)]) * w.x; - classsum += (sum[mad24(info2.y,step,info2.x)] - sum[mad24(info2.y,step,info2.z)] - - sum[mad24(info2.w,step,info2.x)] + sum[mad24(info2.w,step,info2.z)]) * w.y; + int grpszx = get_local_size(0); + int grpszy = get_local_size(1); + int grpnumx = get_num_groups(0); + int grpidx = get_group_id(0); + int lclidx = get_local_id(0); + int lclidy = get_local_id(1); + int lcl_sz = mul24(grpszx, grpszy); + int lcl_id = mad24(lclidy, grpszx, lclidx); + __local int lclshare[1024]; + __local int *glboutindex = lclshare + 0; + __local int *lclcount = glboutindex + 1; + __local int *lcloutindex = lclcount + 1; + __local float *partialsum = (__local float *)(lcloutindex + (lcl_sz << 1)); + glboutindex[0] = 0; + int outputoff = mul24(grpidx, 256); + candidate[outputoff + (lcl_id << 2)] = (int4)0; + candidate[outputoff + (lcl_id << 2) + 1] = (int4)0; + candidate[outputoff + (lcl_id << 2) + 2] = (int4)0; + candidate[outputoff + (lcl_id << 2) + 3] = (int4)0; - info3.x +=queue_offset; - info3.z +=queue_offset; - classsum += (sum[mad24(info3.y,step,info3.x)] - sum[mad24(info3.y,step,info3.z)] - - sum[mad24(info3.w,step,info3.x)] + sum[mad24(info3.w,step,info3.z)]) * w.z; - part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x; - tempnodecounter+=lcl_compute_win; - } - partialsum[lcl_id]=part_sum; - barrier(CLK_LOCAL_MEM_FENCE); - for(int i=0;i=stagethreshold&&(lcl_compute_id==0)) - { - int queueindex=atomic_inc(lclcount); - lcloutindex[queueindex<<1]=temp_coord; - lcloutindex[(queueindex<<1)+1]=as_int(variance_norm_factor); - } - lcl_compute_win_id+=(1<>16; - temp=glboutindex[0]; - int4 candidate_result; - candidate_result.zw=(int2)convert_int_rtn(factor*20.f); - candidate_result.x=x; - candidate_result.y=y; - atomic_inc(glboutindex); - candidate[outputoff+temp+lcl_id]=candidate_result; - } - barrier(CLK_LOCAL_MEM_FENCE); - } + for (int scalei = 0; scalei < loopcount; scalei++) + { + int4 scaleinfo1; + scaleinfo1 = info[scalei]; + int width = (scaleinfo1.x & 0xffff0000) >> 16; + int height = scaleinfo1.x & 0xffff; + int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16; + int totalgrp = scaleinfo1.y & 0xffff; + float factor = as_float(scaleinfo1.w); + float correction_t = correction[scalei]; + int ystep = (int)(max(2.0f, factor) + 0.5f); + + for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx) + { + int4 cascadeinfo = p[scalei]; + int grpidy = grploop / grpnumperline; + int grpidx = grploop - mul24(grpidy, grpnumperline); + int ix = mad24(grpidx, grpszx, lclidx); + int iy = mad24(grpidy, grpszy, lclidy); + int x = ix * ystep; + int y = iy * ystep; + lcloutindex[lcl_id] = 0; + lclcount[0] = 0; + int result = 1, nodecounter; + float mean, variance_norm_factor; + //if((ix < width) && (iy < height)) + { + const int p_offset = mad24(y, step, x); + cascadeinfo.x += p_offset; + cascadeinfo.z += p_offset; + mean = (sum[mad24(cascadeinfo.y, step, cascadeinfo.x)] - sum[mad24(cascadeinfo.y, step, cascadeinfo.z)] - + sum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sum[mad24(cascadeinfo.w, step, cascadeinfo.z)]) + * correction_t; + variance_norm_factor = sqsum[mad24(cascadeinfo.y, step, cascadeinfo.x)] - sqsum[mad24(cascadeinfo.y, step, cascadeinfo.z)] - + sqsum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sqsum[mad24(cascadeinfo.w, step, cascadeinfo.z)]; + variance_norm_factor = variance_norm_factor * correction_t - mean * mean; + variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f; + result = 1; + nodecounter = startnode + nodecount * scalei; + + for (int stageloop = start_stage; stageloop < end_stage && result; stageloop++) + { + float stage_sum = 0.f; + int4 stageinfo = *(global int4 *)(stagecascadeptr + stageloop); + float stagethreshold = as_float(stageinfo.y); + + for (int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++) + { + __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter); + int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0])); + int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0])); + int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0])); + float4 w = *(__global float4 *)(&(currentnodeptr->weight[0])); + float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0])); + float nodethreshold = w.w * variance_norm_factor; + info1.x += p_offset; + info1.z += p_offset; + info2.x += p_offset; + info2.z += p_offset; + float classsum = (sum[mad24(info1.y, step, info1.x)] - sum[mad24(info1.y, step, info1.z)] - + sum[mad24(info1.w, step, info1.x)] + sum[mad24(info1.w, step, info1.z)]) * w.x; + classsum += (sum[mad24(info2.y, step, info2.x)] - sum[mad24(info2.y, step, info2.z)] - + sum[mad24(info2.w, step, info2.x)] + sum[mad24(info2.w, step, info2.z)]) * w.y; + info3.x += p_offset; + info3.z += p_offset; + classsum += (sum[mad24(info3.y, step, info3.x)] - sum[mad24(info3.y, step, info3.z)] - + sum[mad24(info3.w, step, info3.x)] + sum[mad24(info3.w, step, info3.z)]) * w.z; + stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x; + nodecounter++; + } + + result = (stage_sum >= stagethreshold); + } + + if (result && (ix < width) && (iy < height)) + { + int queueindex = atomic_inc(lclcount); + lcloutindex[queueindex << 1] = (y << 16) | x; + lcloutindex[(queueindex << 1) + 1] = as_int(variance_norm_factor); + } + + barrier(CLK_LOCAL_MEM_FENCE); + int queuecount = lclcount[0]; + nodecounter = splitnode + nodecount * scalei; + + if (lcl_id < queuecount) + { + int temp = lcloutindex[lcl_id << 1]; + int x = temp & 0xffff; + int y = (temp & (int)0xffff0000) >> 16; + temp = glboutindex[0]; + int4 candidate_result; + candidate_result.zw = (int2)convert_int_rtn(factor * 20.f); + candidate_result.x = x; + candidate_result.y = y; + atomic_inc(glboutindex); + candidate[outputoff + temp + lcl_id] = candidate_result; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + } } - } } -__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode * orinode, global GpuHidHaarTreeNode * newnode,float scale,float weight_scale,int nodenum) +__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum) { - int counter=get_global_id(0); - int tr_x[3],tr_y[3],tr_h[3],tr_w[3],i=0; - GpuHidHaarTreeNode t1 = *(orinode + counter); - #pragma unroll - for(i=0;i<3;i++){ - tr_x[i]=(int)(t1.p[i][0]*scale+0.5f); - tr_y[i]=(int)(t1.p[i][1]*scale+0.5f); - tr_w[i]=(int)(t1.p[i][2]*scale+0.5f); - tr_h[i]=(int)(t1.p[i][3]*scale+0.5f); - } - t1.weight[0]=t1.p[2][0]?-(t1.weight[1]*tr_h[1]*tr_w[1]+t1.weight[2]*tr_h[2]*tr_w[2])/(tr_h[0]*tr_w[0]):-t1.weight[1]*tr_h[1]*tr_w[1]/(tr_h[0]*tr_w[0]); - counter+=nodenum; - #pragma unroll - for(i=0;i<3;i++) - { - newnode[counter].p[i][0]=tr_x[i]; - newnode[counter].p[i][1]=tr_y[i]; - newnode[counter].p[i][2]=tr_x[i]+tr_w[i]; - newnode[counter].p[i][3]=tr_y[i]+tr_h[i]; - newnode[counter].weight[i]=t1.weight[i]*weight_scale; - } - newnode[counter].left=t1.left; - newnode[counter].right=t1.right; - newnode[counter].threshold=t1.threshold; - newnode[counter].alpha[0]=t1.alpha[0]; - newnode[counter].alpha[1]=t1.alpha[1]; + int counter = get_global_id(0); + int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0; + GpuHidHaarTreeNode t1 = *(orinode + counter); +#pragma unroll + + for (i = 0; i < 3; i++) + { + tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f); + tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f); + tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f); + tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f); + } + + t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]); + counter += nodenum; +#pragma unroll + + for (i = 0; i < 3; i++) + { + newnode[counter].p[i][0] = tr_x[i]; + newnode[counter].p[i][1] = tr_y[i]; + newnode[counter].p[i][2] = tr_x[i] + tr_w[i]; + newnode[counter].p[i][3] = tr_y[i] + tr_h[i]; + newnode[counter].weight[i] = t1.weight[i] * weight_scale; + } + + newnode[counter].left = t1.left; + newnode[counter].right = t1.right; + newnode[counter].threshold = t1.threshold; + newnode[counter].alpha[0] = t1.alpha[0]; + newnode[counter].alpha[1] = t1.alpha[1]; } diff --git a/modules/ocl/src/kernels/pyrlk.cl b/modules/ocl/src/kernels/pyrlk.cl index ecdacf3b4..c772be78a 100644 --- a/modules/ocl/src/kernels/pyrlk.cl +++ b/modules/ocl/src/kernels/pyrlk.cl @@ -16,6 +16,7 @@ // // @Authors // Dachuan Zhao, dachuan@multicorewareinc.com +// Yao Wang, bitwangyaoyao@gmail.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -45,26 +46,6 @@ //#pragma OPENCL EXTENSION cl_amd_printf : enable -__kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, float scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); - int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); - - float data1 = *((__global float *)((__global char *)src1 + src1_index)); - float tmp = data1 * scalar; - - *((__global float *)((__global char *)dst + dst_index)) = tmp; - } -} - - __kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep) { const int x = get_global_id(0); @@ -202,6 +183,7 @@ float linearFilter_float(__global const float* src, int srcStep, int cn, float2 return src_row[(int)x] * iw00 + src_row[(int)x + cn] * iw01 + src_row1[(int)x] * iw10 + src_row1[(int)x + cn] * iw11, W_BITS1 - 5; } +#define BUFFER 64 void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) { smem1[tid] = val1; @@ -209,6 +191,7 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local f smem3[tid] = val3; barrier(CLK_LOCAL_MEM_FENCE); +#if BUFFER > 128 if (tid < 128) { smem1[tid] = val1 += smem1[tid + 128]; @@ -216,7 +199,9 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local f smem3[tid] = val3 += smem3[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); +#endif +#if BUFFER > 64 if (tid < 64) { smem1[tid] = val1 += smem1[tid + 64]; @@ -224,6 +209,7 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local f smem3[tid] = val3 += smem3[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); +#endif if (tid < 32) { @@ -263,19 +249,23 @@ void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, smem2[tid] = val2; barrier(CLK_LOCAL_MEM_FENCE); +#if BUFFER > 128 if (tid < 128) { smem1[tid] = val1 += smem1[tid + 128]; smem2[tid] = val2 += smem2[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); +#endif +#if BUFFER > 64 if (tid < 64) { smem1[tid] = val1 += smem1[tid + 64]; smem2[tid] = val2 += smem2[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); +#endif if (tid < 32) { @@ -307,17 +297,21 @@ void reduce1(float val1, __local float* smem1, int tid) smem1[tid] = val1; barrier(CLK_LOCAL_MEM_FENCE); +#if BUFFER > 128 if (tid < 128) { smem1[tid] = val1 += smem1[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); +#endif +#if BUFFER > 64 if (tid < 64) { smem1[tid] = val1 += smem1[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); +#endif if (tid < 32) { @@ -333,60 +327,17 @@ void reduce1(float val1, __local float* smem1, int tid) } #define SCALE (1.0f / (1 << 20)) +#define THRESHOLD 0.01f +#define DIMENSION 21 // Image read mode __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; -__kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, - __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status/*, __global float* err*/, const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr, char GET_MIN_EIGENVALS) +void SetPatch(image2d_t I, float x, float y, + float* Pch, float* Dx, float* Dy, + float* A11, float* A12, float* A22) { - __local float smem1[256]; - __local float smem2[256]; - __local float smem3[256]; - - int c_halfWin_x = (c_winSize_x - 1) / 2; - int c_halfWin_y = (c_winSize_y - 1) / 2; - - const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); - - float2 prevPt = prevPts[get_group_id(0)]; - prevPt.x *= (1.0f / (1 << level)); - prevPt.y *= (1.0f / (1 << level)); - - if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) - { - if (level == 0 && tid == 0) - { - status[get_group_id(0)] = 0; - - //if (calcErr) - // err[get_group_id(0)] = 0; - } - - return; - } - - prevPt.x -= c_halfWin_x; - prevPt.y -= c_halfWin_y; - - // extract the patch from the first image, compute covariation matrix of derivatives - - float A11 = 0; - float A12 = 0; - float A22 = 0; - - float I_patch[21][21]; - float dIdx_patch[21][21]; - float dIdy_patch[21][21]; - - for (int yBase = get_local_id(1), i = 0; yBase < c_winSize_y; yBase += get_local_size(1), ++i) - { - for (int xBase = get_local_id(0), j = 0; xBase < c_winSize_x; xBase += get_local_size(0), ++j) - { - float x = (prevPt.x + xBase + 0.5f); - float y = (prevPt.y + yBase + 0.5f); - - I_patch[i][j] = read_imagef(I, sampler, (float2)(x, y)).x; + *Pch = read_imagef(I, sampler, (float2)(x, y)).x; float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x - (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x); @@ -394,158 +345,104 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, float dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x - (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x); - dIdx_patch[i][j] = dIdx; - dIdy_patch[i][j] = dIdy; - A11 += dIdx * dIdx; - A12 += dIdx * dIdy; - A22 += dIdy * dIdy; - } - } + *Dx = dIdx; + *Dy = dIdy; - reduce3(A11, A12, A22, smem1, smem2, smem3, tid); - barrier(CLK_LOCAL_MEM_FENCE); - - A11 = smem1[0]; - A12 = smem2[0]; - A22 = smem3[0]; - - float D = A11 * A22 - A12 * A12; - - //if (calcErr && GET_MIN_EIGENVALS && tid == 0) - // err[get_group_id(0)] = minEig; - - if (D < 1.192092896e-07f) - { - if (level == 0 && tid == 0) - status[get_group_id(0)] = 0; - - return; - } - - D = 1.f / D; - - A11 *= D; - A12 *= D; - A22 *= D; - - float2 nextPt = nextPts[get_group_id(0)]; - nextPt.x *= 2.0f; - nextPt.y *= 2.0f; - - nextPt.x -= c_halfWin_x; - nextPt.y -= c_halfWin_y; - - for (int k = 0; k < c_iters; ++k) - { - if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows) - { - if (tid == 0 && level == 0) - status[get_group_id(0)] = 0; - return; - } - - float b1 = 0; - float b2 = 0; - - for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i) - { - for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j) - { - float a = (nextPt.x + x + 0.5f); - float b = (nextPt.y + y + 0.5f); - - float I_val = I_patch[i][j]; - float J_val = read_imagef(J, sampler, (float2)(a, b)).x; - - float diff = (J_val - I_val) * 32.0f; - - b1 += diff * dIdx_patch[i][j]; - b2 += diff * dIdy_patch[i][j]; - } - } - - reduce2(b1, b2, smem1, smem2, tid); - barrier(CLK_LOCAL_MEM_FENCE); - - b1 = smem1[0]; - b2 = smem2[0]; - - float2 delta; - delta.x = A12 * b2 - A22 * b1; - delta.y = A12 * b1 - A11 * b2; - - nextPt.x += delta.x; - nextPt.y += delta.y; - - if (fabs(delta.x) < 0.01f && fabs(delta.y) < 0.01f) - break; - } - - float errval = 0.0f; - if (calcErr) - { - for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i) - { - for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j) - { - float a = (nextPt.x + x + 0.5f); - float b = (nextPt.y + y + 0.5f); - - float I_val = I_patch[i][j]; - float J_val = read_imagef(J, sampler, (float2)(a, b)).x; - - float diff = J_val - I_val; - - errval += fabs((float)diff); - } - } - - reduce1(errval, smem1, tid); - } - - if (tid == 0) - { - nextPt.x += c_halfWin_x; - nextPt.y += c_halfWin_y; - - nextPts[get_group_id(0)] = nextPt; - - //if (calcErr && !GET_MIN_EIGENVALS) - // err[get_group_id(0)] = errval; - } + *A11 += dIdx * dIdx; + *A12 += dIdx * dIdy; + *A22 += dIdy * dIdy; } -__kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, - __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status/*, __global float* err*/, const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr, char GET_MIN_EIGENVALS) + +void GetPatch(image2d_t J, float x, float y, + float* Pch, float* Dx, float* Dy, + float* b1, float* b2) { - __local float smem1[256]; - __local float smem2[256]; - __local float smem3[256]; + float J_val = read_imagef(J, sampler, (float2)(x, y)).x; + float diff = (J_val - *Pch) * 32.0f; + *b1 += diff**Dx; + *b2 += diff**Dy; +} - int c_halfWin_x = (c_winSize_x - 1) / 2; - int c_halfWin_y = (c_winSize_y - 1) / 2; +void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval) +{ + float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch; + *errval += fabs(diff); +} - const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); +void SetPatch4(image2d_t I, const float x, const float y, + float4* Pch, float4* Dx, float4* Dy, + float* A11, float* A12, float* A22) +{ + *Pch = read_imagef(I, sampler, (float2)(x, y)); - float2 prevPt = prevPts[get_group_id(0)]; - prevPt.x *= (1.0f / (1 << level)); - prevPt.y *= (1.0f / (1 << level)); + float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) - + (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1))); + + float4 dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) - + (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1))); + + + *Dx = dIdx; + *Dy = dIdy; + float4 sqIdx = dIdx * dIdx; + *A11 += sqIdx.x + sqIdx.y + sqIdx.z; + sqIdx = dIdx * dIdy; + *A12 += sqIdx.x + sqIdx.y + sqIdx.z; + sqIdx = dIdy * dIdy; + *A22 += sqIdx.x + sqIdx.y + sqIdx.z; +} + +void GetPatch4(image2d_t J, const float x, const float y, + const float4* Pch, const float4* Dx, const float4* Dy, + float* b1, float* b2) +{ + float4 J_val = read_imagef(J, sampler, (float2)(x, y)); + float4 diff = (J_val - *Pch) * 32.0f; + float4 xdiff = diff* *Dx; + *b1 += xdiff.x + xdiff.y + xdiff.z; + xdiff = diff* *Dy; + *b2 += xdiff.x + xdiff.y + xdiff.z; +} + +void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval) +{ + float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch; + *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z); +} + + +__kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, + __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err, + const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) +{ + __local float smem1[BUFFER]; + __local float smem2[BUFFER]; + __local float smem3[BUFFER]; + + unsigned int xid=get_local_id(0); + unsigned int yid=get_local_id(1); + unsigned int gid=get_group_id(0); + unsigned int xsize=get_local_size(0); + unsigned int ysize=get_local_size(1); + int xBase, yBase, i, j, k; + + float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1); + + const int tid = mad24(yid, xsize, xid); + + float2 prevPt = prevPts[gid] / (1 << level); if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) { - if (level == 0 && tid == 0) + if (tid == 0 && level == 0) { - status[get_group_id(0)] = 0; - - //if (calcErr) - // err[get_group_id(0)] = 0; + status[gid] = 0; } return; } - - prevPt.x -= c_halfWin_x; - prevPt.y -= c_halfWin_y; + prevPt -= c_halfWin; // extract the patch from the first image, compute covariation matrix of derivatives @@ -553,34 +450,68 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, float A12 = 0; float A22 = 0; - float4 I_patch[21][21]; - float4 dIdx_patch[21][21]; - float4 dIdy_patch[21][21]; + float I_patch[3][3]; + float dIdx_patch[3][3]; + float dIdy_patch[3][3]; - for (int yBase = get_local_id(1), i = 0; yBase < c_winSize_y; yBase += get_local_size(1), ++i) - { - for (int xBase = get_local_id(0), j = 0; xBase < c_winSize_x; xBase += get_local_size(0), ++j) + yBase=yid; { - float x = (prevPt.x + xBase + 0.5f); - float y = (prevPt.y + yBase + 0.5f); + xBase=xid; + SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f, + &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0], + &A11, &A12, &A22); - I_patch[i][j] = read_imagef(I, sampler, (float2)(x, y)).x; - float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x - - (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x); + xBase+=xsize; + SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f, + &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1], + &A11, &A12, &A22); - float4 dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x - - (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x); - - dIdx_patch[i][j] = dIdx; - dIdy_patch[i][j] = dIdy; - - A11 += (dIdx * dIdx).x + (dIdx * dIdx).y + (dIdx * dIdx).z; - A12 += (dIdx * dIdy).x + (dIdx * dIdy).y + (dIdx * dIdy).z; - A22 += (dIdy * dIdy).x + (dIdy * dIdy).y + (dIdy * dIdy).z; + xBase+=xsize; + if(xBase= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows) + if (prevPt.x < -c_halfWin.x || prevPt.x >= cols || prevPt.y < -c_halfWin.y || prevPt.y >= rows) { if (tid == 0 && level == 0) - status[get_group_id(0)] = 0; + status[gid] = 0; return; } float b1 = 0; float b2 = 0; - for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i) - { - for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j) - { - float a = (nextPt.x + x + 0.5f); - float b = (nextPt.y + y + 0.5f); + yBase=yid; + { + xBase=xid; + GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f, + &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0], + &b1, &b2); - float4 I_val = I_patch[i][j]; - float4 J_val = read_imagef(J, sampler, (float2)(a, b)).x; - float4 diff = (J_val - I_val) * 32.0f; + xBase+=xsize; + GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f, + &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1], + &b1, &b2); - b1 += (diff * dIdx_patch[i][j]).x + (diff * dIdx_patch[i][j]).y + (diff * dIdx_patch[i][j]).z; - b2 += (diff * dIdy_patch[i][j]).x + (diff * dIdy_patch[i][j]).y + (diff * dIdy_patch[i][j]).z; - } - } + xBase+=xsize; + if(xBase>1, (c_winSize_y - 1)>>1); + + const int tid = mad24(yid, xsize, xid); + + float2 nextPt = prevPts[gid]/(1<= cols || nextPt.y < 0 || nextPt.y >= rows) + { + if (tid == 0 && level == 0) + { + status[gid] = 0; + } + + return; + } + + nextPt -= c_halfWin; + + // extract the patch from the first image, compute covariation matrix of derivatives + + float A11 = 0; + float A12 = 0; + float A22 = 0; + + float4 I_patch[8]; + float4 dIdx_patch[8]; + float4 dIdy_patch[8]; + float4 I_add,Dx_add,Dy_add; + + yBase=yid; + { + xBase=xid; + SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f, + &I_patch[0], &dIdx_patch[0], &dIdy_patch[0], + &A11, &A12, &A22); + + + xBase+=xsize; + SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f, + &I_patch[1], &dIdx_patch[1], &dIdy_patch[1], + &A11, &A12, &A22); + + xBase+=xsize; + if(xBase= cols || nextPt.y < -c_halfWin.y || nextPt.y >= rows) + { + if (tid == 0 && level == 0) + status[gid] = 0; + return; + } + + float b1 = 0; + float b2 = 0; + + yBase=yid; + { + xBase=xid; + GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f, + &I_patch[0], &dIdx_patch[0], &dIdy_patch[0], + &b1, &b2); + + + xBase+=xsize; + GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f, + &I_patch[1], &dIdx_patch[1], &dIdy_patch[1], + &b1, &b2); + + xBase+=xsize; + if(xBase 128 + + if (tid < 128) + { + smem1[tid] = val1 += smem1[tid + 128]; + smem2[tid] = val2 += smem2[tid + 128]; + smem3[tid] = val3 += smem3[tid + 128]; + } + + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + + if (tid < 64) + { + smem1[tid] = val1 += smem1[tid + 64]; + smem2[tid] = val2 += smem2[tid + 64]; + smem3[tid] = val3 += smem3[tid + 64]; + } + + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (tid < 32) + { + smem1[tid] = val1 += smem1[tid + 32]; + smem2[tid] = val2 += smem2[tid + 32]; + smem3[tid] = val3 += smem3[tid + 32]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem1[tid] = val1 += smem1[tid + 16]; + smem2[tid] = val2 += smem2[tid + 16]; + smem3[tid] = val3 += smem3[tid + 16]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + volatile __local float *vmem1 = smem1; + volatile __local float *vmem2 = smem2; + volatile __local float *vmem3 = smem3; + + vmem1[tid] = val1 += vmem1[tid + 8]; + vmem2[tid] = val2 += vmem2[tid + 8]; + vmem3[tid] = val3 += vmem3[tid + 8]; + + vmem1[tid] = val1 += vmem1[tid + 4]; + vmem2[tid] = val2 += vmem2[tid + 4]; + vmem3[tid] = val3 += vmem3[tid + 4]; + + vmem1[tid] = val1 += vmem1[tid + 2]; + vmem2[tid] = val2 += vmem2[tid + 2]; + vmem3[tid] = val3 += vmem3[tid + 2]; + + vmem1[tid] = val1 += vmem1[tid + 1]; + vmem2[tid] = val2 += vmem2[tid + 1]; + vmem3[tid] = val3 += vmem3[tid + 1]; + } +} + +void reduce2(float val1, float val2, __local float *smem1, __local float *smem2, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + barrier(CLK_LOCAL_MEM_FENCE); + +#if BUFFER > 128 + + if (tid < 128) + { + smem1[tid] = val1 += smem1[tid + 128]; + smem2[tid] = val2 += smem2[tid + 128]; + } + + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + + if (tid < 64) + { + smem1[tid] = val1 += smem1[tid + 64]; + smem2[tid] = val2 += smem2[tid + 64]; + } + + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (tid < 32) + { + smem1[tid] = val1 += smem1[tid + 32]; + smem2[tid] = val2 += smem2[tid + 32]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem1[tid] = val1 += smem1[tid + 16]; + smem2[tid] = val2 += smem2[tid + 16]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + volatile __local float *vmem1 = smem1; + volatile __local float *vmem2 = smem2; + + vmem1[tid] = val1 += vmem1[tid + 8]; + vmem2[tid] = val2 += vmem2[tid + 8]; + + vmem1[tid] = val1 += vmem1[tid + 4]; + vmem2[tid] = val2 += vmem2[tid + 4]; + + vmem1[tid] = val1 += vmem1[tid + 2]; + vmem2[tid] = val2 += vmem2[tid + 2]; + + vmem1[tid] = val1 += vmem1[tid + 1]; + vmem2[tid] = val2 += vmem2[tid + 1]; + } +} + +void reduce1(float val1, __local float *smem1, int tid) +{ + smem1[tid] = val1; + barrier(CLK_LOCAL_MEM_FENCE); + +#if BUFFER > 128 + + if (tid < 128) + { + smem1[tid] = val1 += smem1[tid + 128]; + } + + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + + if (tid < 64) + { + smem1[tid] = val1 += smem1[tid + 64]; + } + + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (tid < 32) + { + smem1[tid] = val1 += smem1[tid + 32]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + volatile __local float *vmem1 = smem1; + + vmem1[tid] = val1 += vmem1[tid + 16]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + volatile __local float *vmem1 = smem1; + + vmem1[tid] = val1 += vmem1[tid + 8]; + vmem1[tid] = val1 += vmem1[tid + 4]; + vmem1[tid] = val1 += vmem1[tid + 2]; + vmem1[tid] = val1 += vmem1[tid + 1]; + } +} + +#define SCALE (1.0f / (1 << 20)) +#define THRESHOLD 0.01f +#define DIMENSION 21 + +float readImage2Df_C1(__global const float *image, const float x, const float y, const int rows, const int cols, const int elemCntPerRow) +{ + float2 coor = (float2)(x, y); + + int i0 = clamp((int)floor(coor.x), 0, cols - 1); + int j0 = clamp((int)floor(coor.y), 0, rows - 1); + int i1 = clamp((int)floor(coor.x) + 1, 0, cols - 1); + int j1 = clamp((int)floor(coor.y) + 1, 0, rows - 1); + float a = coor.x - floor(coor.x); + float b = coor.y - floor(coor.y); + + return (1 - a) * (1 - b) * image[mad24(j0, elemCntPerRow, i0)] + + a * (1 - b) * image[mad24(j0, elemCntPerRow, i1)] + + (1 - a) * b * image[mad24(j1, elemCntPerRow, i0)] + + a * b * image[mad24(j1, elemCntPerRow, i1)]; +} + +__kernel void lkSparse_C1_D5(__global const float *I, __global const float *J, + __global const float2 *prevPts, int prevPtsStep, __global float2 *nextPts, int nextPtsStep, __global uchar *status, __global float *err, + const int level, const int rows, const int cols, const int elemCntPerRow, + int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) +{ + __local float smem1[BUFFER]; + __local float smem2[BUFFER]; + __local float smem3[BUFFER]; + + float2 c_halfWin = (float2)((c_winSize_x - 1) >> 1, (c_winSize_y - 1) >> 1); + + const int tid = mad24(get_local_id(1), get_local_size(0), get_local_id(0)); + + float2 prevPt = prevPts[get_group_id(0)] * (1.0f / (1 << level)); + + if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) + { + if (tid == 0 && level == 0) + { + status[get_group_id(0)] = 0; + } + + return; + } + + prevPt -= c_halfWin; + + // extract the patch from the first image, compute covariation matrix of derivatives + + float A11 = 0; + float A12 = 0; + float A22 = 0; + + float I_patch[1][3]; + float dIdx_patch[1][3]; + float dIdy_patch[1][3]; + + for (int yBase = get_local_id(1), i = 0; yBase < c_winSize_y; yBase += get_local_size(1), ++i) + { + for (int xBase = get_local_id(0), j = 0; xBase < c_winSize_x; xBase += get_local_size(0), ++j) + { + float x = (prevPt.x + xBase); + float y = (prevPt.y + yBase); + + I_patch[i][j] = readImage2Df_C1(I, x, y, rows, cols, elemCntPerRow); + float dIdx = 3.0f * readImage2Df_C1(I, x + 1, y - 1, rows, cols, elemCntPerRow) + 10.0f * readImage2Df_C1(I, x + 1, y, rows, cols, elemCntPerRow) + 3.0f * readImage2Df_C1(I, x + 1, y + 1, rows, cols, elemCntPerRow) - + (3.0f * readImage2Df_C1(I, x - 1, y - 1, rows, cols, elemCntPerRow) + 10.0f * readImage2Df_C1(I, x - 1, y, rows, cols, elemCntPerRow) + 3.0f * readImage2Df_C1(I, x - 1, y + 1, rows, cols, elemCntPerRow)); + + float dIdy = 3.0f * readImage2Df_C1(I, x - 1, y + 1, rows, cols, elemCntPerRow) + 10.0f * readImage2Df_C1(I, x, y + 1, rows, cols, elemCntPerRow) + 3.0f * readImage2Df_C1(I, x + 1, y + 1, rows, cols, elemCntPerRow) - + (3.0f * readImage2Df_C1(I, x - 1, y - 1, rows, cols, elemCntPerRow) + 10.0f * readImage2Df_C1(I, x, y - 1, rows, cols, elemCntPerRow) + 3.0f * readImage2Df_C1(I, x + 1, y - 1, rows, cols, elemCntPerRow)); + + dIdx_patch[i][j] = dIdx; + dIdy_patch[i][j] = dIdy; + + A11 += dIdx * dIdx; + A12 += dIdx * dIdy; + A22 += dIdy * dIdy; + } + } + + reduce3(A11, A12, A22, smem1, smem2, smem3, tid); + barrier(CLK_LOCAL_MEM_FENCE); + + A11 = smem1[0]; + A12 = smem2[0]; + A22 = smem3[0]; + + float D = A11 * A22 - A12 * A12; + + if (D < 1.192092896e-07f) + { + if (tid == 0 && level == 0) + { + status[get_group_id(0)] = 0; + } + + return; + } + + D = 1.f / D; + + A11 *= D; + A12 *= D; + A22 *= D; + + float2 nextPt = nextPts[get_group_id(0)]; + nextPt = nextPt * 2.0f - c_halfWin; + + for (int k = 0; k < c_iters; ++k) + { + if (nextPt.x < -c_halfWin.x || nextPt.x >= cols || nextPt.y < -c_halfWin.y || nextPt.y >= rows) + { + if (tid == 0 && level == 0) + { + status[get_group_id(0)] = 0; + } + + return; + } + + float b1 = 0; + float b2 = 0; + + for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i) + { + for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j) + { + float diff = (readImage2Df_C1(J, nextPt.x + x, nextPt.y + y, rows, cols, elemCntPerRow) - I_patch[i][j]) * 32.0f; + + b1 += diff * dIdx_patch[i][j]; + b2 += diff * dIdy_patch[i][j]; + } + } + + reduce2(b1, b2, smem1, smem2, tid); + barrier(CLK_LOCAL_MEM_FENCE); + + b1 = smem1[0]; + b2 = smem2[0]; + + float2 delta; + delta.x = A12 * b2 - A22 * b1; + delta.y = A12 * b1 - A11 * b2; + + nextPt += delta; + + //if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD) + // break; + } + + float errval = 0.0f; + + if (calcErr) + { + for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i) + { + for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j) + { + float diff = readImage2Df_C1(J, nextPt.x + x, nextPt.y + y, rows, cols, elemCntPerRow) - I_patch[i][j]; + + errval += fabs(diff); + } + } + + reduce1(errval, smem1, tid); + } + + if (tid == 0) + { + nextPt += c_halfWin; + + nextPts[get_group_id(0)] = nextPt; + + if (calcErr) + { + err[get_group_id(0)] = smem1[0] / (c_winSize_x * c_winSize_y); + } + } +} + +float4 readImage2Df_C4(__global const float4 *image, const float x, const float y, const int rows, const int cols, const int elemCntPerRow) +{ + float2 coor = (float2)(x, y); + + int i0 = clamp((int)floor(coor.x), 0, cols - 1); + int j0 = clamp((int)floor(coor.y), 0, rows - 1); + int i1 = clamp((int)floor(coor.x) + 1, 0, cols - 1); + int j1 = clamp((int)floor(coor.y) + 1, 0, rows - 1); + float a = coor.x - floor(coor.x); + float b = coor.y - floor(coor.y); + + return (1 - a) * (1 - b) * image[mad24(j0, elemCntPerRow, i0)] + + a * (1 - b) * image[mad24(j0, elemCntPerRow, i1)] + + (1 - a) * b * image[mad24(j1, elemCntPerRow, i0)] + + a * b * image[mad24(j1, elemCntPerRow, i1)]; +} + +__kernel void lkSparse_C4_D5(__global const float *I, __global const float *J, + __global const float2 *prevPts, int prevPtsStep, __global float2 *nextPts, int nextPtsStep, __global uchar *status, __global float *err, + const int level, const int rows, const int cols, const int elemCntPerRow, + int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) +{ + __local float smem1[BUFFER]; + __local float smem2[BUFFER]; + __local float smem3[BUFFER]; + + float2 c_halfWin = (float2)((c_winSize_x - 1) >> 1, (c_winSize_y - 1) >> 1); + + const int tid = mad24(get_local_id(1), get_local_size(0), get_local_id(0)); + + float2 prevPt = prevPts[get_group_id(0)] * (1.0f / (1 << level)); + + if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) + { + if (tid == 0 && level == 0) + { + status[get_group_id(0)] = 0; + } + + return; + } + + prevPt -= c_halfWin; + + // extract the patch from the first image, compute covariation matrix of derivatives + + float A11 = 0; + float A12 = 0; + float A22 = 0; + + float4 I_patch[1][3]; + float4 dIdx_patch[1][3]; + float4 dIdy_patch[1][3]; + + __global float4 *ptrI = (__global float4 *)I; + + for (int yBase = get_local_id(1), i = 0; yBase < c_winSize_y; yBase += get_local_size(1), ++i) + { + for (int xBase = get_local_id(0), j = 0; xBase < c_winSize_x; xBase += get_local_size(0), ++j) + { + float x = (prevPt.x + xBase); + float y = (prevPt.y + yBase); + + I_patch[i][j] = readImage2Df_C4(ptrI, x, y, rows, cols, elemCntPerRow); + + float4 dIdx = 3.0f * readImage2Df_C4(ptrI, x + 1, y - 1, rows, cols, elemCntPerRow) + 10.0f * readImage2Df_C4(ptrI, x + 1, y, rows, cols, elemCntPerRow) + 3.0f * readImage2Df_C4(ptrI, x + 1, y + 1, rows, cols, elemCntPerRow) - + (3.0f * readImage2Df_C4(ptrI, x - 1, y - 1, rows, cols, elemCntPerRow) + 10.0f * readImage2Df_C4(ptrI, x - 1, y, rows, cols, elemCntPerRow) + 3.0f * readImage2Df_C4(ptrI, x - 1, y + 1, rows, cols, elemCntPerRow)); + + float4 dIdy = 3.0f * readImage2Df_C4(ptrI, x - 1, y + 1, rows, cols, elemCntPerRow) + 10.0f * readImage2Df_C4(ptrI, x, y + 1, rows, cols, elemCntPerRow) + 3.0f * readImage2Df_C4(ptrI, x + 1, y + 1, rows, cols, elemCntPerRow) - + (3.0f * readImage2Df_C4(ptrI, x - 1, y - 1, rows, cols, elemCntPerRow) + 10.0f * readImage2Df_C4(ptrI, x, y - 1, rows, cols, elemCntPerRow) + 3.0f * readImage2Df_C4(ptrI, x + 1, y - 1, rows, cols, elemCntPerRow)); + + dIdx_patch[i][j] = dIdx; + dIdy_patch[i][j] = dIdy; + + A11 += (dIdx * dIdx).x + (dIdx * dIdx).y + (dIdx * dIdx).z; + A12 += (dIdx * dIdy).x + (dIdx * dIdy).y + (dIdx * dIdy).z; + A22 += (dIdy * dIdy).x + (dIdy * dIdy).y + (dIdy * dIdy).z; + } + } + + reduce3(A11, A12, A22, smem1, smem2, smem3, tid); + barrier(CLK_LOCAL_MEM_FENCE); + + A11 = smem1[0]; + A12 = smem2[0]; + A22 = smem3[0]; + + float D = A11 * A22 - A12 * A12; + //pD[get_group_id(0)] = D; + + if (D < 1.192092896e-07f) + { + if (tid == 0 && level == 0) + { + status[get_group_id(0)] = 0; + } + + return; + } + + D = 1.f / D; + + A11 *= D; + A12 *= D; + A22 *= D; + + float2 nextPt = nextPts[get_group_id(0)]; + + nextPt = nextPt * 2.0f - c_halfWin; + + __global float4 *ptrJ = (__global float4 *)J; + + for (int k = 0; k < c_iters; ++k) + { + if (nextPt.x < -c_halfWin.x || nextPt.x >= cols || nextPt.y < -c_halfWin.y || nextPt.y >= rows) + { + if (tid == 0 && level == 0) + { + status[get_group_id(0)] = 0; + } + + return; + } + + float b1 = 0; + float b2 = 0; + + for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i) + { + for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j) + { + float4 diff = (readImage2Df_C4(ptrJ, nextPt.x + x, nextPt.y + y, rows, cols, elemCntPerRow) - I_patch[i][j]) * 32.0f; + + b1 += (diff * dIdx_patch[i][j]).x + (diff * dIdx_patch[i][j]).y + (diff * dIdx_patch[i][j]).z; + b2 += (diff * dIdy_patch[i][j]).x + (diff * dIdy_patch[i][j]).y + (diff * dIdy_patch[i][j]).z; + } + } + + reduce2(b1, b2, smem1, smem2, tid); + barrier(CLK_LOCAL_MEM_FENCE); + + b1 = smem1[0]; + b2 = smem2[0]; + + float2 delta; + delta.x = A12 * b2 - A22 * b1; + delta.y = A12 * b1 - A11 * b2; + + nextPt += delta; + + //if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD) + // break; + } + + float errval = 0.0f; + + if (calcErr) + { + for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i) + { + for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j) + { + float4 diff = readImage2Df_C4(ptrJ, nextPt.x + x, nextPt.y + y, rows, cols, elemCntPerRow) - I_patch[i][j]; + + errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z); + } + } + + reduce1(errval, smem1, tid); + } + + if (tid == 0) + { + nextPt += c_halfWin; + nextPts[get_group_id(0)] = nextPt; + + if (calcErr) + { + err[get_group_id(0)] = smem1[0] / (3 * c_winSize_x * c_winSize_y); + } + } +} + +int readImage2Di_C1(__global const int *image, float2 coor, int2 size, const int elemCntPerRow) +{ + int i = clamp((int)floor(coor.x), 0, size.x - 1); + int j = clamp((int)floor(coor.y), 0, size.y - 1); + return image[mad24(j, elemCntPerRow, i)]; +} + +__kernel void lkDense_C1_D0(__global const int *I, __global const int *J, __global float *u, int uStep, __global float *v, int vStep, __global const float *prevU, int prevUStep, __global const float *prevV, int prevVStep, + const int rows, const int cols, /*__global float* err, int errStep, int cn,*/ + const int elemCntPerRow, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) +{ + int c_halfWin_x = (c_winSize_x - 1) / 2; + int c_halfWin_y = (c_winSize_y - 1) / 2; + + const int patchWidth = get_local_size(0) + 2 * c_halfWin_x; + const int patchHeight = get_local_size(1) + 2 * c_halfWin_y; + + __local int smem[8192]; + + __local int *I_patch = smem; + __local int *dIdx_patch = I_patch + patchWidth * patchHeight; + __local int *dIdy_patch = dIdx_patch + patchWidth * patchHeight; + + const int xBase = get_group_id(0) * get_local_size(0); + const int yBase = get_group_id(1) * get_local_size(1); + int2 size = (int2)(cols, rows); + + for (int i = get_local_id(1); i < patchHeight; i += get_local_size(1)) + { + for (int j = get_local_id(0); j < patchWidth; j += get_local_size(0)) + { + float x = xBase - c_halfWin_x + j + 0.5f; + float y = yBase - c_halfWin_y + i + 0.5f; + + I_patch[i * patchWidth + j] = readImage2Di_C1(I, (float2)(x, y), size, elemCntPerRow); + + // Sharr Deriv + + dIdx_patch[i * patchWidth + j] = 3 * readImage2Di_C1(I, (float2)(x + 1, y - 1), size, elemCntPerRow) + 10 * readImage2Di_C1(I, (float2)(x + 1, y), size, elemCntPerRow) + 3 * readImage2Di_C1(I, (float2)(x + 1, y + 1), size, elemCntPerRow) - + (3 * readImage2Di_C1(I, (float2)(x - 1, y - 1), size, elemCntPerRow) + 10 * readImage2Di_C1(I, (float2)(x - 1, y), size, elemCntPerRow) + 3 * readImage2Di_C1(I, (float2)(x - 1, y + 1), size, elemCntPerRow)); + + dIdy_patch[i * patchWidth + j] = 3 * readImage2Di_C1(I, (float2)(x - 1, y + 1), size, elemCntPerRow) + 10 * readImage2Di_C1(I, (float2)(x, y + 1), size, elemCntPerRow) + 3 * readImage2Di_C1(I, (float2)(x + 1, y + 1), size, elemCntPerRow) - + (3 * readImage2Di_C1(I, (float2)(x - 1, y - 1), size, elemCntPerRow) + 10 * readImage2Di_C1(I, (float2)(x, y - 1), size, elemCntPerRow) + 3 * readImage2Di_C1(I, (float2)(x + 1, y - 1), size, elemCntPerRow)); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // extract the patch from the first image, compute covariation matrix of derivatives + + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= cols || y >= rows) + { + return; + } + + int A11i = 0; + int A12i = 0; + int A22i = 0; + + for (int i = 0; i < c_winSize_y; ++i) + { + for (int j = 0; j < c_winSize_x; ++j) + { + int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)]; + int dIdy = dIdy_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)]; + + A11i += dIdx * dIdx; + A12i += dIdx * dIdy; + A22i += dIdy * dIdy; + } + } + + float A11 = A11i; + float A12 = A12i; + float A22 = A22i; + + float D = A11 * A22 - A12 * A12; + + //if (calcErr && GET_MIN_EIGENVALS) + // (err + y * errStep)[x] = minEig; + + if (D < 1.192092896e-07f) + { + //if (calcErr) + // err(y, x) = 3.402823466e+38f; + + return; + } + + D = 1.f / D; + + A11 *= D; + A12 *= D; + A22 *= D; + + float2 nextPt; + nextPt.x = x + prevU[y / 2 * prevUStep / 4 + x / 2] * 2.0f; + nextPt.y = y + prevV[y / 2 * prevVStep / 4 + x / 2] * 2.0f; + + for (int k = 0; k < c_iters; ++k) + { + if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows) + { + //if (calcErr) + // err(y, x) = 3.402823466e+38f; + + return; + } + + int b1 = 0; + int b2 = 0; + + for (int i = 0; i < c_winSize_y; ++i) + { + for (int j = 0; j < c_winSize_x; ++j) + { + int iI = I_patch[(get_local_id(1) + i) * patchWidth + get_local_id(0) + j]; + int iJ = readImage2Di_C1(J, (float2)(nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f), size, elemCntPerRow); + + int diff = (iJ - iI) * 32; + + int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)]; + int dIdy = dIdy_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)]; + + b1 += diff * dIdx; + b2 += diff * dIdy; + } + } + + float2 delta; + delta.x = A12 * b2 - A22 * b1; + delta.y = A12 * b1 - A11 * b2; + + nextPt.x += delta.x; + nextPt.y += delta.y; + + if (fabs(delta.x) < 0.01f && fabs(delta.y) < 0.01f) + { + break; + } + } + + u[y * uStep / 4 + x] = nextPt.x - x; + v[y * vStep / 4 + x] = nextPt.y - y; + + if (calcErr) + { + int errval = 0; + + for (int i = 0; i < c_winSize_y; ++i) + { + for (int j = 0; j < c_winSize_x; ++j) + { + int iI = I_patch[(get_local_id(1) + i) * patchWidth + get_local_id(0) + j]; + int iJ = readImage2Di_C1(J, (float2)(nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f), size, elemCntPerRow); + + errval += abs(iJ - iI); + } + } + + //err[y * errStep / 4 + x] = static_cast(errval) / (c_winSize_x * c_winSize_y); + } +} diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp index dac303c65..7165a8c02 100644 --- a/modules/ocl/src/pyrlk.cpp +++ b/modules/ocl/src/pyrlk.cpp @@ -48,23 +48,24 @@ using namespace cv::ocl; #if !defined (HAVE_OPENCL) -void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &, const oclMat &, const oclMat &, oclMat &, oclMat &, oclMat *) { } +void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &, const oclMat &, const oclMat &, oclMat &, oclMat &, oclMat &) { } void cv::ocl::PyrLKOpticalFlow::dense(const oclMat &, const oclMat &, oclMat &, oclMat &, oclMat *) { } #else /* !defined (HAVE_OPENCL) */ namespace cv { - namespace ocl - { - ///////////////////////////OpenCL kernel strings/////////////////////////// - extern const char *pyrlk; - extern const char *operator_setTo; - extern const char *operator_convertTo; - extern const char *operator_copyToM; - extern const char *arithm_mul; - extern const char *pyr_down; - } +namespace ocl +{ +///////////////////////////OpenCL kernel strings/////////////////////////// +extern const char *pyrlk; +extern const char *pyrlk_no_image; +extern const char *operator_setTo; +extern const char *operator_convertTo; +extern const char *operator_copyToM; +extern const char *arithm_mul; +extern const char *pyr_down; +} } struct dim3 @@ -84,26 +85,26 @@ struct int2 namespace { - void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11) +void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11) +{ + winSize.width *= cn; + + if (winSize.width > 32 && winSize.width > 2 * winSize.height) { - winSize.width *= cn; - - if (winSize.width > 32 && winSize.width > 2 * winSize.height) - { - block.x = isDeviceArch11 ? 16 : 32; - block.y = 8; - } - else - { - block.x = 16; - block.y = isDeviceArch11 ? 8 : 16; - } - - patch.x = (winSize.width + block.x - 1) / block.x; - patch.y = (winSize.height + block.y - 1) / block.y; - - block.z = patch.z = 1; + block.x = isDeviceArch11 ? 16 : 32; + block.y = 8; } + else + { + block.x = 16; + block.y = isDeviceArch11 ? 8 : 16; + } + + patch.x = (winSize.width + block.x - 1) / block.x; + patch.y = (winSize.height + block.y - 1) / block.y; + + block.z = patch.z = 1; +} } inline int divUp(int total, int grain) @@ -530,7 +531,7 @@ void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, const ch void multiply_cus(const oclMat &src1, oclMat &dst, float scalar) { - arithmetic_run(src1, dst, "arithm_muls", &pyrlk, (void *)(&scalar)); + arithmetic_run(src1, dst, "arithm_muls", &arithm_mul, (void *)(&scalar)); } void pyrdown_run_cus(const oclMat &src, const oclMat &dst) @@ -581,26 +582,26 @@ void pyrDown_cus(const oclMat &src, oclMat &dst) // //void callF(const oclMat& src, oclMat& dst, MultiplyScalar op, int mask) //{ -// Mat srcTemp; -// Mat dstTemp; -// src.download(srcTemp); -// dst.download(dstTemp); +// Mat srcTemp; +// Mat dstTemp; +// src.download(srcTemp); +// dst.download(dstTemp); // -// int i; -// int j; -// int k; -// for(i = 0; i < srcTemp.rows; i++) -// { -// for(j = 0; j < srcTemp.cols; j++) -// { -// for(k = 0; k < srcTemp.channels(); k++) -// { -// ((float*)dstTemp.data)[srcTemp.channels() * (i * srcTemp.rows + j) + k] = (float)op(((float*)srcTemp.data)[srcTemp.channels() * (i * srcTemp.rows + j) + k]); -// } -// } -// } +// int i; +// int j; +// int k; +// for(i = 0; i < srcTemp.rows; i++) +// { +// for(j = 0; j < srcTemp.cols; j++) +// { +// for(k = 0; k < srcTemp.channels(); k++) +// { +// ((float*)dstTemp.data)[srcTemp.channels() * (i * srcTemp.rows + j) + k] = (float)op(((float*)srcTemp.data)[srcTemp.channels() * (i * srcTemp.rows + j) + k]); +// } +// } +// } // -// dst = dstTemp; +// dst = dstTemp; //} // //static inline bool isAligned(const unsigned char* ptr, size_t size) @@ -622,54 +623,54 @@ void pyrDown_cus(const oclMat &src, oclMat &dst) // return; // } // -// Mat srcTemp; -// Mat dstTemp; -// src.download(srcTemp); -// dst.download(dstTemp); +// Mat srcTemp; +// Mat dstTemp; +// src.download(srcTemp); +// dst.download(dstTemp); // -// int x_shifted; +// int x_shifted; // -// int i; -// int j; -// for(i = 0; i < srcTemp.rows; i++) -// { -// const double* srcRow = (const double*)srcTemp.data + i * srcTemp.rows; +// int i; +// int j; +// for(i = 0; i < srcTemp.rows; i++) +// { +// const double* srcRow = (const double*)srcTemp.data + i * srcTemp.rows; // double* dstRow = (double*)dstTemp.data + i * dstTemp.rows;; // -// for(j = 0; j < srcTemp.cols; j++) -// { -// x_shifted = j * 4; +// for(j = 0; j < srcTemp.cols; j++) +// { +// x_shifted = j * 4; // -// if(x_shifted + 4 - 1 < srcTemp.cols) -// { -// dstRow[x_shifted ] = op(srcRow[x_shifted ]); -// dstRow[x_shifted + 1] = op(srcRow[x_shifted + 1]); -// dstRow[x_shifted + 2] = op(srcRow[x_shifted + 2]); -// dstRow[x_shifted + 3] = op(srcRow[x_shifted + 3]); -// } -// else -// { -// for (int real_x = x_shifted; real_x < srcTemp.cols; ++real_x) -// { -// ((float*)dstTemp.data)[i * srcTemp.rows + real_x] = op(((float*)srcTemp.data)[i * srcTemp.rows + real_x]); -// } -// } -// } -// } +// if(x_shifted + 4 - 1 < srcTemp.cols) +// { +// dstRow[x_shifted ] = op(srcRow[x_shifted ]); +// dstRow[x_shifted + 1] = op(srcRow[x_shifted + 1]); +// dstRow[x_shifted + 2] = op(srcRow[x_shifted + 2]); +// dstRow[x_shifted + 3] = op(srcRow[x_shifted + 3]); +// } +// else +// { +// for (int real_x = x_shifted; real_x < srcTemp.cols; ++real_x) +// { +// ((float*)dstTemp.data)[i * srcTemp.rows + real_x] = op(((float*)srcTemp.data)[i * srcTemp.rows + real_x]); +// } +// } +// } +// } //} // //void multiply(const oclMat& src1, double val, oclMat& dst, double scale = 1.0f); //void multiply(const oclMat& src1, double val, oclMat& dst, double scale) //{ // MultiplyScalar op(val, scale); -// //if(src1.channels() == 1 && dst.channels() == 1) -// //{ -// // callT(src1, dst, op, 0); -// //} -// //else -// //{ -// callF(src1, dst, op, 0); -// //} +// //if(src1.channels() == 1 && dst.channels() == 1) +// //{ +// // callT(src1, dst, op, 0); +// //} +// //else +// //{ +// callF(src1, dst, op, 0); +// //} //} cl_mem bindTexture(const oclMat &mat, int depth, int channels) @@ -735,46 +736,69 @@ void releaseTexture(cl_mem texture) } void lkSparse_run(oclMat &I, oclMat &J, - const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat *err, bool GET_MIN_EIGENVALS, int ptcount, + const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount, int level, /*dim3 block, */dim3 patch, Size winSize, int iters) { Context *clCxt = I.clCxt; + char platform[256] = {0}; + cl_platform_id pid; + clGetDeviceInfo(*clCxt->impl->devices, CL_DEVICE_PLATFORM, sizeof(pid), &pid, NULL); + clGetPlatformInfo(pid, CL_PLATFORM_NAME, 256, platform, NULL); + std::string namestr = platform; + bool isImageSupported = true; + if(namestr.find("NVIDIA")!=string::npos || namestr.find("Intel")!=string::npos) + isImageSupported = false; + + int elemCntPerRow = I.step / I.elemSize(); string kernelName = "lkSparse"; - size_t localThreads[3] = { 8, 32, 1 }; - size_t globalThreads[3] = { 8 * ptcount, 32, 1}; + + size_t localThreads[3] = { 8, isImageSupported?8:32, 1 }; + size_t globalThreads[3] = { 8 * ptcount, isImageSupported?8:32, 1}; int cn = I.oclchannels(); - bool calcErr; - if (err) + char calcErr; + if (level == 0) { - calcErr = true; + calcErr = 1; } else { - calcErr = false; + calcErr = 0; } - calcErr = true; - - cl_mem ITex = bindTexture(I, I.depth(), cn); - cl_mem JTex = bindTexture(J, J.depth(), cn); vector > args; + cl_mem ITex; + cl_mem JTex; + if (isImageSupported) + { + ITex = bindTexture(I, I.depth(), cn); + JTex = bindTexture(J, J.depth(), cn); + } + else + { + ITex = (cl_mem)I.data; + JTex = (cl_mem)J.data; + } args.push_back( make_pair( sizeof(cl_mem), (void *)&ITex )); args.push_back( make_pair( sizeof(cl_mem), (void *)&JTex )); - + //cl_mem clmD = clCreateBuffer(clCxt, CL_MEM_READ_WRITE, ptcount * sizeof(float), NULL, NULL); args.push_back( make_pair( sizeof(cl_mem), (void *)&prevPts.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&prevPts.step )); args.push_back( make_pair( sizeof(cl_mem), (void *)&nextPts.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&nextPts.step )); args.push_back( make_pair( sizeof(cl_mem), (void *)&status.data )); - //args.push_back( make_pair( sizeof(cl_mem), (void *)&(err->data) )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&err.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&level )); args.push_back( make_pair( sizeof(cl_int), (void *)&I.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&I.cols )); + if (!isImageSupported) + { + args.push_back( make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) ); + } args.push_back( make_pair( sizeof(cl_int), (void *)&patch.x )); args.push_back( make_pair( sizeof(cl_int), (void *)&patch.y )); args.push_back( make_pair( sizeof(cl_int), (void *)&cn )); @@ -782,27 +806,29 @@ void lkSparse_run(oclMat &I, oclMat &J, args.push_back( make_pair( sizeof(cl_int), (void *)&winSize.height )); args.push_back( make_pair( sizeof(cl_int), (void *)&iters )); args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr )); - args.push_back( make_pair( sizeof(cl_char), (void *)&GET_MIN_EIGENVALS )); + //args.push_back( make_pair( sizeof(cl_char), (void *)&GET_MIN_EIGENVALS )); - openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + if (isImageSupported) + { + openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); - releaseTexture(ITex); - releaseTexture(JTex); + releaseTexture(ITex); + releaseTexture(JTex); + } + else + { + //printf("Warning: The image2d_t is not supported by the device. Using alternative method!\n"); + openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + } } void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &nextImg, const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat *err) { - if (prevImg.clCxt->impl->devName.find("Intel(R) HD Graphics") != string::npos) - { - cout << " Intel HD GPU device unsupported " << endl; - return; - } - if (prevPts.empty()) { nextPts.release(); status.release(); - if (err) err->release(); + //if (err) err->release(); return; } @@ -836,8 +862,15 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next //status.setTo(Scalar::all(1)); setTo(status, Scalar::all(1)); - //if (err) - // ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err); + bool errMat = false; + if (!err) + { + err = new oclMat(1, prevPts.cols, CV_32FC1); + errMat = true; + } + else + ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err); + //ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, err); // build the image pyramids. @@ -872,17 +905,22 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next for (int level = maxLevel; level >= 0; level--) { lkSparse_run(prevPyr_[level], nextPyr_[level], - prevPts, nextPts, status, level == 0 && err ? err : 0, getMinEigenVals, prevPts.cols, + prevPts, nextPts, status, *err, getMinEigenVals, prevPts.cols, level, /*block, */patch, winSize, iters); } clFinish(prevImg.clCxt->impl->clCmdQueue); + + if(errMat) + delete err; } void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v, oclMat &prevU, oclMat &prevV, oclMat *err, Size winSize, int iters) { Context *clCxt = I.clCxt; + bool isImageSupported = clCxt->impl->devName.find("Intel(R) HD Graphics") == string::npos; + int elemCntPerRow = I.step / I.elemSize(); string kernelName = "lkDense"; @@ -901,8 +939,19 @@ void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v, calcErr = false; } - cl_mem ITex = bindTexture(I, I.depth(), cn); - cl_mem JTex = bindTexture(J, J.depth(), cn); + cl_mem ITex; + cl_mem JTex; + + if (isImageSupported) + { + ITex = bindTexture(I, I.depth(), cn); + JTex = bindTexture(J, J.depth(), cn); + } + else + { + ITex = (cl_mem)I.data; + JTex = (cl_mem)J.data; + } //int2 halfWin = {(winSize.width - 1) / 2, (winSize.height - 1) / 2}; //const int patchWidth = 16 + 2 * halfWin.x; @@ -926,15 +975,27 @@ void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v, args.push_back( make_pair( sizeof(cl_int), (void *)&I.cols )); //args.push_back( make_pair( sizeof(cl_mem), (void *)&(*err).data )); //args.push_back( make_pair( sizeof(cl_int), (void *)&(*err).step )); + if (!isImageSupported) + { + args.push_back( make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) ); + } args.push_back( make_pair( sizeof(cl_int), (void *)&winSize.width )); args.push_back( make_pair( sizeof(cl_int), (void *)&winSize.height )); args.push_back( make_pair( sizeof(cl_int), (void *)&iters )); args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr )); - openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + if (isImageSupported) + { + openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); - releaseTexture(ITex); - releaseTexture(JTex); + releaseTexture(ITex); + releaseTexture(JTex); + } + else + { + //printf("Warning: The image2d_t is not supported by the device. Using alternative method!\n"); + openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + } } void cv::ocl::PyrLKOpticalFlow::dense(const oclMat &prevImg, const oclMat &nextImg, oclMat &u, oclMat &v, oclMat *err) diff --git a/modules/ocl/test/test_pyrlk.cpp b/modules/ocl/test/test_pyrlk.cpp index b594a3483..7c747ee4f 100644 --- a/modules/ocl/test/test_pyrlk.cpp +++ b/modules/ocl/test/test_pyrlk.cpp @@ -118,9 +118,9 @@ TEST_P(Sparse, Mat) cv::Mat status_mat(1, d_status.cols, CV_8UC1, (void *)&status[0]); d_status.download(status_mat); - //std::vector err(d_err.cols); - //cv::Mat err_mat(1, d_err.cols, CV_32FC1, (void*)&err[0]); - //d_err.download(err_mat); + std::vector err(d_err.cols); + cv::Mat err_mat(1, d_err.cols, CV_32FC1, (void*)&err[0]); + d_err.download(err_mat); std::vector nextPts_gold; std::vector status_gold; @@ -153,9 +153,9 @@ TEST_P(Sparse, Mat) } } - double bad_ratio = static_cast(mistmatch) / (nextPts.size() * 2); + double bad_ratio = static_cast(mistmatch) / (nextPts.size()); - ASSERT_LE(bad_ratio, 0.05f); + ASSERT_LE(bad_ratio, 0.02f); }