diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index 12c7b8b06..850a2e60e 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -60,6 +60,7 @@ class CV_EXPORTS Program; class CV_EXPORTS ProgramSource2; class CV_EXPORTS Queue; class CV_EXPORTS PlatformInfo2; +class CV_EXPORTS Image2D; class CV_EXPORTS Device { @@ -89,6 +90,7 @@ public: String vendor() const; String OpenCL_C_Version() const; String OpenCLVersion() const; + String deviceVersion() const; String driverVersion() const; void* ptr() const; @@ -325,6 +327,7 @@ public: const String& buildopts, String* errmsg=0); int set(int i, const void* value, size_t sz); + int set(int i, const Image2D& image2D); int set(int i, const UMat& m); int set(int i, const KernelArg& arg); template int set(int i, const _Tp& value) @@ -574,6 +577,19 @@ CV_EXPORTS const char* typeToStr(int t); CV_EXPORTS const char* memopTypeToStr(int t); CV_EXPORTS void getPlatfomsInfo(std::vector& platform_info); +class CV_EXPORTS Image2D +{ +public: + Image2D(); + Image2D(const UMat &src); + ~Image2D(); + + void* ptr() const; +protected: + struct Impl; + Impl* p; +}; + }} #endif diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 73980629a..f103cd99d 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -1705,6 +1705,9 @@ String Device::OpenCL_C_Version() const String Device::OpenCLVersion() const { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); } +String Device::deviceVersion() const +{ return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); } + String Device::driverVersion() const { return p ? p->getStrProp(CL_DRIVER_VERSION) : String(); } @@ -2689,6 +2692,12 @@ int Kernel::set(int i, const void* value, size_t sz) return i+1; } +int Kernel::set(int i, const Image2D& image2D) +{ + cl_mem h = (cl_mem)image2D.ptr(); + return set(i, &h, sizeof(h)); +} + int Kernel::set(int i, const UMat& m) { return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0)); @@ -3785,4 +3794,149 @@ const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf) return buf; } +/////////////////////////////////////////////////////////////////////////////////////////////// +// deviceVersion has format +// OpenCL +// by specification +// http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html +// http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html +static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor) +{ + major = minor = 0; + if (10 >= deviceVersion.length()) + return; + const char *pstr = deviceVersion.c_str(); + if (0 != strncmp(pstr, "OpenCL ", 7)) + return; + size_t ppos = deviceVersion.find('.', 7); + if (String::npos == ppos) + return; + String temp = deviceVersion.substr(7, ppos - 7); + major = atoi(temp.c_str()); + temp = deviceVersion.substr(ppos + 1); + minor = atoi(temp.c_str()); +} + +struct Image2D::Impl +{ + Impl(const UMat &src) + { + init(src); + } + ~Impl() + { + if (handle) + clReleaseMemObject(handle); + } + void init(const UMat &src) + { + cl_image_format format; + int err; + int depth = src.depth(); + int channels = src.channels(); + + switch(depth) + { + case CV_8U: + format.image_channel_data_type = CL_UNSIGNED_INT8; + break; + case CV_32S: + format.image_channel_data_type = CL_UNSIGNED_INT32; + break; + case CV_32F: + format.image_channel_data_type = CL_FLOAT; + break; + default: + CV_Error(-1, "Image forma is not supported"); + break; + } + switch(channels) + { + case 1: + format.image_channel_order = CL_R; + break; + case 3: + format.image_channel_order = CL_RGB; + break; + case 4: + format.image_channel_order = CL_RGBA; + break; + default: + CV_Error(-1, "Image format is not supported"); + break; + } +#ifdef CL_VERSION_1_2 + //this enables backwards portability to + //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support + int minor, major; + parseDeviceVersion(Device::getDefault().deviceVersion(), major, minor); + if ((1 < major) || ((1 == major) && (2 <= minor))) + { + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = src.cols; + desc.image_height = src.rows; + desc.image_depth = 0; + desc.image_array_size = 1; + desc.image_row_pitch = 0; + desc.image_slice_pitch = 0; + desc.buffer = NULL; + desc.num_mip_levels = 0; + desc.num_samples = 0; + handle = clCreateImage((cl_context)Context2::getDefault().ptr(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err); + } + else +#endif + { + handle = clCreateImage2D((cl_context)Context2::getDefault().ptr(), CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err); + } + size_t origin[] = { 0, 0, 0 }; + size_t region[] = { src.cols, src.rows, 1 }; + + cl_mem devData; + if (!src.isContinuous()) + { + devData = clCreateBuffer((cl_context)Context2::getDefault().ptr(), CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, NULL); + const size_t roi[3] = {src.cols * src.elemSize(), src.rows, 1}; + clEnqueueCopyBufferRect((cl_command_queue)Queue::getDefault().ptr(), (cl_mem)src.handle(ACCESS_READ), devData, origin, origin, + roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL); + clFlush((cl_command_queue)Queue::getDefault().ptr()); + } + else + { + devData = (cl_mem)src.handle(ACCESS_READ); + } + + clEnqueueCopyBufferToImage((cl_command_queue)Queue::getDefault().ptr(), devData, handle, 0, origin, region, 0, NULL, 0); + if (!src.isContinuous()) + { + clFlush((cl_command_queue)Queue::getDefault().ptr()); + clReleaseMemObject(devData); + } + } + + IMPLEMENT_REFCOUNTABLE(); + + cl_mem handle; +}; + +Image2D::Image2D() +{ + p = NULL; +} +Image2D::Image2D(const UMat &src) +{ + p = new Impl(src); +} +Image2D::~Image2D() +{ + if (p) + p->release(); +} + +void* Image2D::ptr() const +{ + return p ? p->handle : 0; +} + }} diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp index 155737ba7..2cab3237c 100644 --- a/modules/video/src/lkpyramid.cpp +++ b/modules/video/src/lkpyramid.cpp @@ -43,6 +43,7 @@ #include #include #include "lkpyramid.hpp" +#include "opencl_kernels.hpp" #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) @@ -590,6 +591,231 @@ int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Si return maxLevel; } +namespace cv +{ + class PyrLKOpticalFlow + { + struct dim3 + { + unsigned int x, y, z; + }; + public: + PyrLKOpticalFlow() + { + winSize = Size(21, 21); + maxLevel = 3; + iters = 30; + derivLambda = 0.5; + useInitialFlow = false; + //minEigThreshold = 1e-4f; + //getMinEigenVals = false; + } + + bool sparse(const UMat &prevImg, const UMat &nextImg, const UMat &prevPts, UMat &nextPts, UMat &status, UMat &err) + { + if (prevPts.empty()) + { + nextPts.release(); + status.release(); + return false; + } + + derivLambda = std::min(std::max(derivLambda, 0.0), 1.0); + if (derivLambda < 0) + return false; + if (maxLevel < 0 || winSize.width <= 2 || winSize.height <= 2) + return false; + iters = std::min(std::max(iters, 0), 100); + if (prevPts.rows != 1 || prevPts.type() != CV_32FC2) + return false; + + dim3 patch; + calcPatchSize(patch); + if (patch.x <= 0 || patch.x >= 6 || patch.y <= 0 || patch.y >= 6) + return false; + if (!initWaveSize()) + return false; + if (useInitialFlow) + { + if (nextPts.size() != prevPts.size() || nextPts.type() != CV_32FC2) + return false; + } + else + ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts); + + UMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1); + UMat temp2 = nextPts.reshape(1); + multiply(1.0f / (1 << maxLevel) /2.0f, temp1, temp2); + + ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status); + status.setTo(Scalar::all(1)); + + ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, err); + + // build the image pyramids. + std::vector prevPyr; prevPyr.resize(maxLevel + 1); + std::vector nextPyr; nextPyr.resize(maxLevel + 1); + + prevImg.convertTo(prevPyr[0], CV_32F); + nextImg.convertTo(nextPyr[0], CV_32F); + + for (int level = 1; level <= maxLevel; ++level) + { + pyrDown(prevPyr[level - 1], prevPyr[level]); + pyrDown(nextPyr[level - 1], nextPyr[level]); + } + + // dI/dx ~ Ix, dI/dy ~ Iy + for (int level = maxLevel; level >= 0; level--) + { + lkSparse_run(prevPyr[level], nextPyr[level], + prevPts, nextPts, status, err, prevPts.cols, + level, patch); + } + return true; + } + + Size winSize; + int maxLevel; + int iters; + double derivLambda; + bool useInitialFlow; + //float minEigThreshold; + //bool getMinEigenVals; + + private: + void calcPatchSize(dim3 &patch) + { + dim3 block; + //winSize.width *= cn; + + if (winSize.width > 32 && winSize.width > 2 * winSize.height) + { + block.x = 32; + block.y = 8; + } + else + { + block.x = 16; + block.y = 16; + } + + patch.x = (winSize.width + block.x - 1) / block.x; + patch.y = (winSize.height + block.y - 1) / block.y; + + block.z = patch.z = 1; + } + private: + int waveSize; + bool initWaveSize() + { + waveSize = 1; + if (isDeviceCPU()) + return true; + + ocl::Kernel kernel; + if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, "")) + return false; + waveSize = (int)kernel.preferedWorkGroupSizeMultiple(); + return true; + } + bool lkSparse_run(UMat &I, UMat &J, const UMat &prevPts, UMat &nextPts, UMat &status, UMat& err, + int ptcount, int level, dim3 patch) + { + size_t localThreads[3] = { 8, 8}; + size_t globalThreads[3] = { 8 * ptcount, 8}; + char calcErr = (0 == level) ? 1 : 0; + + cv::String build_options; + if (isDeviceCPU()) + build_options = " -D CPU"; + else + build_options = cv::format("-D WAVE_SIZE=%d", waveSize); + + ocl::Kernel kernel; + if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, build_options)) + return false; + + ocl::Image2D imageI(I); + ocl::Image2D imageJ(J); + int idxArg = 0; + idxArg = kernel.set(idxArg, imageI); //image2d_t I + idxArg = kernel.set(idxArg, imageJ); //image2d_t J + idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(prevPts)); // __global const float2* prevPts + idxArg = kernel.set(idxArg, (int)prevPts.step); // int prevPtsStep + idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(nextPts)); // __global const float2* nextPts + idxArg = kernel.set(idxArg, (int)nextPts.step); // int nextPtsStep + idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(status)); // __global uchar* status + idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(err)); // __global float* err + idxArg = kernel.set(idxArg, (int)level); // const int level + idxArg = kernel.set(idxArg, (int)I.rows); // const int rows + idxArg = kernel.set(idxArg, (int)I.cols); // const int cols + idxArg = kernel.set(idxArg, (int)patch.x); // int PATCH_X + idxArg = kernel.set(idxArg, (int)patch.y); // int PATCH_Y + idxArg = kernel.set(idxArg, (int)winSize.width); // int c_winSize_x + idxArg = kernel.set(idxArg, (int)winSize.height); // int c_winSize_y + idxArg = kernel.set(idxArg, (int)iters); // int c_iters + idxArg = kernel.set(idxArg, (char)calcErr); //char calcErr + + return kernel.run(2, globalThreads, localThreads, true); + } + private: + inline static bool isDeviceCPU() + { + return (cv::ocl::Device::TYPE_CPU == cv::ocl::Device::getDefault().type()); + } + inline static void ensureSizeIsEnough(int rows, int cols, int type, UMat &m) + { + if (m.type() == type && m.rows >= rows && m.cols >= cols) + m = m(Rect(0, 0, cols, rows)); + else + m.create(rows, cols, type); + } + }; + + + bool ocl_calcOpticalFlowPyrLK(InputArray _prevImg, InputArray _nextImg, + InputArray _prevPts, InputOutputArray _nextPts, + OutputArray _status, OutputArray _err, + Size winSize, int maxLevel, + TermCriteria criteria, + int flags/*, double minEigThreshold*/ ) + { + if (0 != (OPTFLOW_LK_GET_MIN_EIGENVALS & flags)) + return false; + if (!cv::ocl::Device::getDefault().imageSupport()) + return false; + if (_nextImg.size() != _prevImg.size()) + return false; + int typePrev = _prevImg.type(); + int typeNext = _nextImg.type(); + if ((1 != CV_MAT_CN(typePrev)) || (1 != CV_MAT_CN(typeNext))) + return false; + if ((0 != CV_MAT_DEPTH(typePrev)) || (0 != CV_MAT_DEPTH(typeNext))) + return false; + + PyrLKOpticalFlow opticalFlow; + opticalFlow.winSize = winSize; + opticalFlow.maxLevel = maxLevel; + opticalFlow.iters = criteria.maxCount; + opticalFlow.derivLambda = criteria.epsilon; + opticalFlow.useInitialFlow = (0 != (flags & OPTFLOW_USE_INITIAL_FLOW)); + + UMat umatErr; + if (_err.needed()) + { + _err.create(_prevPts.size(), CV_8UC1); + umatErr = _err.getUMat(); + } + + _nextPts.create(_prevPts.size(), _prevPts.type()); + _status.create(_prevPts.size(), CV_8UC1); + UMat umatNextPts = _nextPts.getUMat(); + UMat umatStatus = _status.getUMat(); + return opticalFlow.sparse(_prevImg.getUMat(), _nextImg.getUMat(), _prevPts.getUMat(), umatNextPts, umatStatus, umatErr); + } +}; + void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg, InputArray _prevPts, InputOutputArray _nextPts, OutputArray _status, OutputArray _err, @@ -597,6 +823,10 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg, TermCriteria criteria, int flags, double minEigThreshold ) { + bool use_opencl = ocl::useOpenCL() && (_prevImg.isUMat() || _nextImg.isUMat()); + if ( use_opencl && ocl_calcOpticalFlowPyrLK(_prevImg, _nextImg, _prevPts, _nextPts, _status, _err, winSize, maxLevel, criteria, flags/*, minEigThreshold*/)) + return; + Mat prevPtsMat = _prevPts.getMat(); const int derivDepth = DataType::depth; diff --git a/modules/video/src/opencl/pyrlk.cl b/modules/video/src/opencl/pyrlk.cl new file mode 100644 index 000000000..c01855490 --- /dev/null +++ b/modules/video/src/opencl/pyrlk.cl @@ -0,0 +1,583 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Dachuan Zhao, dachuan@multicorewareinc.com +// Yao Wang, bitwangyaoyao@gmail.com +// Xiaopeng Fu, fuxiaopeng2222@163.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#define BUFFER 64 +#define BUFFER2 BUFFER>>1 +#ifndef WAVE_SIZE +#define WAVE_SIZE 1 +#endif +#ifdef CPU + +inline void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + smem3[tid] = val3; + barrier(CLK_LOCAL_MEM_FENCE); + + for(int i = BUFFER2; i > 0; i >>= 1) + { + if(tid < i) + { + smem1[tid] += smem1[tid + i]; + smem2[tid] += smem2[tid + i]; + smem3[tid] += smem3[tid + i]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} + +inline void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + barrier(CLK_LOCAL_MEM_FENCE); + + for(int i = BUFFER2; i > 0; i >>= 1) + { + if(tid < i) + { + smem1[tid] += smem1[tid + i]; + smem2[tid] += smem2[tid + i]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} + +inline void reduce1(float val1, volatile __local float* smem1, int tid) +{ + smem1[tid] = val1; + barrier(CLK_LOCAL_MEM_FENCE); + + for(int i = BUFFER2; i > 0; i >>= 1) + { + if(tid < i) + { + smem1[tid] += smem1[tid + i]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} +#else +inline void reduce3(float val1, float val2, float val3, + __local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + smem3[tid] = val3; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 32) + { + smem1[tid] += smem1[tid + 32]; + smem2[tid] += smem2[tid + 32]; + smem3[tid] += smem3[tid + 32]; +#if WAVE_SIZE < 32 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) + { +#endif + smem1[tid] += smem1[tid + 16]; + smem2[tid] += smem2[tid + 16]; + smem3[tid] += smem3[tid + 16]; +#if WAVE_SIZE <16 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { +#endif + smem1[tid] += smem1[tid + 8]; + smem2[tid] += smem2[tid + 8]; + smem3[tid] += smem3[tid + 8]; + + smem1[tid] += smem1[tid + 4]; + smem2[tid] += smem2[tid + 4]; + smem3[tid] += smem3[tid + 4]; + + smem1[tid] += smem1[tid + 2]; + smem2[tid] += smem2[tid + 2]; + smem3[tid] += smem3[tid + 2]; + + smem1[tid] += smem1[tid + 1]; + smem2[tid] += smem2[tid + 1]; + smem3[tid] += smem3[tid + 1]; + } + barrier(CLK_LOCAL_MEM_FENCE); +} + +inline void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 32) + { + smem1[tid] += smem1[tid + 32]; + smem2[tid] += smem2[tid + 32]; +#if WAVE_SIZE < 32 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) + { +#endif + smem1[tid] += smem1[tid + 16]; + smem2[tid] += smem2[tid + 16]; +#if WAVE_SIZE <16 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { +#endif + smem1[tid] += smem1[tid + 8]; + smem2[tid] += smem2[tid + 8]; + + smem1[tid] += smem1[tid + 4]; + smem2[tid] += smem2[tid + 4]; + + smem1[tid] += smem1[tid + 2]; + smem2[tid] += smem2[tid + 2]; + + smem1[tid] += smem1[tid + 1]; + smem2[tid] += smem2[tid + 1]; + } + barrier(CLK_LOCAL_MEM_FENCE); +} + +inline void reduce1(float val1, __local volatile float* smem1, int tid) +{ + smem1[tid] = val1; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 32) + { + smem1[tid] += smem1[tid + 32]; +#if WAVE_SIZE < 32 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) + { +#endif + smem1[tid] += smem1[tid + 16]; +#if WAVE_SIZE <16 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { +#endif + smem1[tid] += smem1[tid + 8]; + smem1[tid] += smem1[tid + 4]; + smem1[tid] += smem1[tid + 2]; + smem1[tid] += smem1[tid + 1]; + } + barrier(CLK_LOCAL_MEM_FENCE); +} +#endif + +#define SCALE (1.0f / (1 << 20)) +#define THRESHOLD 0.01f + +// Image read mode +__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; + +inline void SetPatch(image2d_t I, float x, float y, + float* Pch, float* Dx, float* Dy, + float* A11, float* A12, float* A22) +{ + *Pch = read_imagef(I, sampler, (float2)(x, y)).x; + + float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x - + (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x); + + float dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x - + (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x); + + + *Dx = dIdx; + *Dy = dIdy; + + *A11 += dIdx * dIdx; + *A12 += dIdx * dIdy; + *A22 += dIdy * dIdy; +} + +inline void GetPatch(image2d_t J, float x, float y, + float* Pch, float* Dx, float* Dy, + float* b1, float* b2) +{ + float J_val = read_imagef(J, sampler, (float2)(x, y)).x; + float diff = (J_val - *Pch) * 32.0f; + *b1 += diff**Dx; + *b2 += diff**Dy; +} + +inline void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval) +{ + float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch; + *errval += fabs(diff); +} + +inline void SetPatch4(image2d_t I, const float x, const float y, + float4* Pch, float4* Dx, float4* Dy, + float* A11, float* A12, float* A22) +{ + *Pch = read_imagef(I, sampler, (float2)(x, y)); + + float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) - + (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1))); + + float4 dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) - + (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1))); + + + *Dx = dIdx; + *Dy = dIdy; + float4 sqIdx = dIdx * dIdx; + *A11 += sqIdx.x + sqIdx.y + sqIdx.z; + sqIdx = dIdx * dIdy; + *A12 += sqIdx.x + sqIdx.y + sqIdx.z; + sqIdx = dIdy * dIdy; + *A22 += sqIdx.x + sqIdx.y + sqIdx.z; +} + +inline void GetPatch4(image2d_t J, const float x, const float y, + const float4* Pch, const float4* Dx, const float4* Dy, + float* b1, float* b2) +{ + float4 J_val = read_imagef(J, sampler, (float2)(x, y)); + float4 diff = (J_val - *Pch) * 32.0f; + float4 xdiff = diff* *Dx; + *b1 += xdiff.x + xdiff.y + xdiff.z; + xdiff = diff* *Dy; + *b2 += xdiff.x + xdiff.y + xdiff.z; +} + +inline void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval) +{ + float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch; + *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z); +} + +#define GRIDSIZE 3 +__kernel void lkSparse(image2d_t I, image2d_t J, + __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err, + const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) +{ + __local float smem1[BUFFER]; + __local float smem2[BUFFER]; + __local float smem3[BUFFER]; + + unsigned int xid=get_local_id(0); + unsigned int yid=get_local_id(1); + unsigned int gid=get_group_id(0); + unsigned int xsize=get_local_size(0); + unsigned int ysize=get_local_size(1); + int xBase, yBase, k; + + float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1); + + const int tid = mad24(yid, xsize, xid); + + float2 prevPt = prevPts[gid] / (float2)(1 << level); + + if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) + { + if (tid == 0 && level == 0) + { + status[gid] = 0; + } + + return; + } + prevPt -= c_halfWin; + + // extract the patch from the first image, compute covariation matrix of derivatives + + float A11 = 0; + float A12 = 0; + float A22 = 0; + + float I_patch[GRIDSIZE][GRIDSIZE]; + float dIdx_patch[GRIDSIZE][GRIDSIZE]; + float dIdy_patch[GRIDSIZE][GRIDSIZE]; + + yBase=yid; + { + xBase=xid; + SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f, + &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0], + &A11, &A12, &A22); + + + xBase+=xsize; + SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f, + &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1], + &A11, &A12, &A22); + + xBase+=xsize; + if(xBase= cols || prevPt.y < -c_halfWin.y || prevPt.y >= rows) + { + if (tid == 0 && level == 0) + status[gid] = 0; + return; + } + + float b1 = 0; + float b2 = 0; + + yBase=yid; + { + xBase=xid; + GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f, + &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0], + &b1, &b2); + + + xBase+=xsize; + GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f, + &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1], + &b1, &b2); + + xBase+=xsize; + if(xBase pts; + cv::goodFeaturesToTrack(frame0, pts, 1000, 0.01, 0.0); + + std::vector nextPtsCPU; + std::vector statusCPU; + std::vector errCPU; + OCL_OFF(cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPtsCPU, statusCPU, errCPU, winSize, maxLevel, criteria, flags, minEigThreshold)); + + UMat umatNextPts, umatStatus, umatErr; + OCL_ON(cv::calcOpticalFlowPyrLK(umatFrame0, umatFrame1, pts, umatNextPts, umatStatus, umatErr, winSize, maxLevel, criteria, flags, minEigThreshold)); + std::vector nextPts(umatNextPts.cols); umatNextPts.copyTo(nextPts); + std::vector status; umatStatus.copyTo(status); + std::vector err; umatErr.copyTo(err); + + ASSERT_EQ(nextPtsCPU.size(), nextPts.size()); + ASSERT_EQ(statusCPU.size(), status.size()); + + size_t mistmatch = 0; + for (size_t i = 0; i < nextPts.size(); ++i) + { + if (status[i] != statusCPU[i]) + { + ++mistmatch; + continue; + } + + if (status[i]) + { + cv::Point2i a = nextPts[i]; + cv::Point2i b = nextPtsCPU[i]; + + bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1; + float errdiff = 0.0f; + + if (!eq || errdiff > 1e-1) + ++mistmatch; + } + } + + double bad_ratio = static_cast(mistmatch) / (nextPts.size()); + + ASSERT_LE(bad_ratio, 0.02f); +} + +OCL_INSTANTIATE_TEST_CASE_P(Video, PyrLKOpticalFlow, + Combine( + Values(21, 25), + Values(3, 5) + ) + ); + +} } // namespace cvtest::ocl + + +#endif // HAVE_OPENCL \ No newline at end of file