Merge pull request #2155 from vbystricky:ocl_calcOpticalFlowPyrLK

2014-01-25 17:08:06 +04:00 · 2014-01-25 17:08:06 +04:00 · a64d3c1744
commit a64d3c1744
parent 1db8aa8628 bb09d44e0a
6 changed files with 1283 additions and 2 deletions
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@ -60,6 +60,7 @@ class CV_EXPORTS Program;
 class CV_EXPORTS ProgramSource2;
 class CV_EXPORTS Queue;
 class CV_EXPORTS PlatformInfo2;
+class CV_EXPORTS Image2D;

 class CV_EXPORTS Device
 {
@ -89,6 +90,7 @@ public:
    String vendor() const;
    String OpenCL_C_Version() const;
    String OpenCLVersion() const;
+    String deviceVersion() const;
    String driverVersion() const;
    void* ptr() const;

@ -325,6 +327,7 @@ public:
                const String& buildopts, String* errmsg=0);

    int set(int i, const void* value, size_t sz);
+    int set(int i, const Image2D& image2D);
    int set(int i, const UMat& m);
    int set(int i, const KernelArg& arg);
    template<typename _Tp> int set(int i, const _Tp& value)
@ -574,6 +577,19 @@ CV_EXPORTS const char* typeToStr(int t);
 CV_EXPORTS const char* memopTypeToStr(int t);
 CV_EXPORTS void getPlatfomsInfo(std::vector<PlatformInfo2>& platform_info);

+class CV_EXPORTS Image2D
+{
+public:
+    Image2D();
+    Image2D(const UMat &src);
+    ~Image2D();
+
+    void* ptr() const;
+protected:
+    struct Impl;
+    Impl* p;
+};
+
 }}

 #endif
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@ -821,6 +821,7 @@ OCL_FUNC_P(cl_mem, clCreateSubBuffer,
    const void * buffer_create_info,
    cl_int * errcode_ret),
    (buffer, flags, buffer_create_type, buffer_create_info, errcode_ret))
+*/

 OCL_FUNC_P(cl_mem, clCreateImage,
    (cl_context context,
@ -831,6 +832,18 @@ OCL_FUNC_P(cl_mem, clCreateImage,
    cl_int * errcode_ret),
    (context, flags, image_format, image_desc, host_ptr, errcode_ret))

+OCL_FUNC_P(cl_mem, clCreateImage2D,
+    (cl_context context,
+    cl_mem_flags flags,
+    const cl_image_format * image_format,
+    size_t image_width,
+    size_t image_height,
+    size_t image_row_pitch,
+    void * host_ptr,
+    cl_int *errcode_ret),
+    (context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret))
+
+/*
 OCL_FUNC(cl_int, clGetSupportedImageFormats,
 (cl_context context,
 cl_mem_flags flags,
@ -945,21 +958,26 @@ OCL_FUNC(cl_int, clEnqueueCopyImageToBuffer,
 cl_event * event),
 (command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
 num_events_in_wait_list, event_wait_list, event))
+*/

 OCL_FUNC(cl_int, clEnqueueCopyBufferToImage,
 (cl_command_queue command_queue,
 cl_mem src_buffer,
 cl_mem dst_image,
 size_t src_offset,
- const size_t * dst_origin[3],
- const size_t * region[3],
+ const size_t dst_origin[3],
+ const size_t region[3],
 cl_uint num_events_in_wait_list,
 const cl_event * event_wait_list,
 cl_event * event),
 (command_queue, src_buffer, dst_image, src_offset, dst_origin,
 region, num_events_in_wait_list, event_wait_list, event))

+ OCL_FUNC(cl_int, clFlush,
+ (cl_command_queue command_queue),
+ (command_queue))

+/*
 OCL_FUNC_P(void*, clEnqueueMapImage,
 (cl_command_queue command_queue,
 cl_mem image,
@ -976,7 +994,9 @@ OCL_FUNC_P(void*, clEnqueueMapImage,
 (command_queue, image, blocking_map, map_flags, origin, region,
 image_row_pitch, image_slice_pitch, num_events_in_wait_list,
 event_wait_list, event, errcode_ret))
+*/

+/*
 OCL_FUNC(cl_int, clRetainProgram, (cl_program program), (program))

 OCL_FUNC(cl_int, clGetKernelInfo,
@ -1705,6 +1725,9 @@ String Device::OpenCL_C_Version() const
 String Device::OpenCLVersion() const
 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }

+String Device::deviceVersion() const
+{ return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
+
 String Device::driverVersion() const
 { return p ? p->getStrProp(CL_DRIVER_VERSION) : String(); }

@ -2689,6 +2712,12 @@ int Kernel::set(int i, const void* value, size_t sz)
    return i+1;
 }

+int Kernel::set(int i, const Image2D& image2D)
+{
+    cl_mem h = (cl_mem)image2D.ptr();
+    return set(i, &h, sizeof(h));
+}
+
 int Kernel::set(int i, const UMat& m)
 {
    return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
@ -3785,4 +3814,151 @@ const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
    return buf;
 }

+///////////////////////////////////////////////////////////////////////////////////////////////
+// deviceVersion has format
+//   OpenCL<space><major_version.minor_version><space><vendor-specific information>
+// by specification
+//   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
+//   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
+static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
+{
+    major = minor = 0;
+    if (10 >= deviceVersion.length())
+        return;
+    const char *pstr = deviceVersion.c_str();
+    if (0 != strncmp(pstr, "OpenCL ", 7))
+        return;
+    size_t ppos = deviceVersion.find('.', 7);
+    if (String::npos == ppos)
+        return;
+    String temp = deviceVersion.substr(7, ppos - 7);
+    major = atoi(temp.c_str());
+    temp = deviceVersion.substr(ppos + 1);
+    minor = atoi(temp.c_str());
+}
+
+struct Image2D::Impl
+{
+    Impl(const UMat &src)
+    {
+        handle = 0;
+        refcount = 1;
+        init(src);
+    }
+    ~Impl()
+    {
+        if (handle)
+            clReleaseMemObject(handle);
+    }
+    void init(const UMat &src)
+    {
+        cl_image_format format;
+        int err;
+        int depth    = src.depth();
+        int channels = src.channels();
+
+        switch(depth)
+        {
+        case CV_8U:
+            format.image_channel_data_type = CL_UNSIGNED_INT8;
+            break;
+        case CV_32S:
+            format.image_channel_data_type = CL_UNSIGNED_INT32;
+            break;
+        case CV_32F:
+            format.image_channel_data_type = CL_FLOAT;
+            break;
+        default:
+            CV_Error(-1, "Image forma is not supported");
+            break;
+        }
+        switch(channels)
+        {
+        case 1:
+            format.image_channel_order     = CL_R;
+            break;
+        case 3:
+            format.image_channel_order     = CL_RGB;
+            break;
+        case 4:
+            format.image_channel_order     = CL_RGBA;
+            break;
+        default:
+            CV_Error(-1, "Image format is not supported");
+            break;
+        }
+#ifdef CL_VERSION_1_2
+        //this enables backwards portability to
+        //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
+        int minor, major;
+        parseDeviceVersion(Device::getDefault().deviceVersion(), major, minor);
+        if ((1 < major) || ((1 == major) && (2 <= minor)))
+        {
+            cl_image_desc desc;
+            desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
+            desc.image_width      = src.cols;
+            desc.image_height     = src.rows;
+            desc.image_depth      = 0;
+            desc.image_array_size = 1;
+            desc.image_row_pitch  = 0;
+            desc.image_slice_pitch = 0;
+            desc.buffer           = NULL;
+            desc.num_mip_levels   = 0;
+            desc.num_samples      = 0;
+            handle = clCreateImage((cl_context)Context2::getDefault().ptr(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
+        }
+        else
+#endif
+        {
+            handle = clCreateImage2D((cl_context)Context2::getDefault().ptr(), CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
+        }
+        size_t origin[] = { 0, 0, 0 };
+        size_t region[] = { src.cols, src.rows, 1 };
+
+        cl_mem devData;
+        if (!src.isContinuous())
+        {
+            devData = clCreateBuffer((cl_context)Context2::getDefault().ptr(), CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, NULL);
+            const size_t roi[3] = {src.cols * src.elemSize(), src.rows, 1};
+            clEnqueueCopyBufferRect((cl_command_queue)Queue::getDefault().ptr(), (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
+                roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL);
+            clFlush((cl_command_queue)Queue::getDefault().ptr());
+        }
+        else
+        {
+            devData = (cl_mem)src.handle(ACCESS_READ);
+        }
+
+        clEnqueueCopyBufferToImage((cl_command_queue)Queue::getDefault().ptr(), devData, handle, 0, origin, region, 0, NULL, 0);
+        if (!src.isContinuous())
+        {
+            clFlush((cl_command_queue)Queue::getDefault().ptr());
+            clReleaseMemObject(devData);
+        }
+    }
+
+    IMPLEMENT_REFCOUNTABLE();
+
+    cl_mem handle;
+};
+
+Image2D::Image2D()
+{
+    p = NULL;
+}
+Image2D::Image2D(const UMat &src)
+{
+    p = new Impl(src);
+}
+Image2D::~Image2D()
+{
+    if (p)
+        p->release();
+}
+
+void* Image2D::ptr() const
+{
+    return p ? p->handle : 0;
+}
+
 }}
--- a/modules/video/perf/opencl/perf_optflow_pyrlk.cpp
+++ b/modules/video/perf/opencl/perf_optflow_pyrlk.cpp
@ -0,0 +1,102 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+using std::tr1::make_tuple;
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////// FarnebackOpticalFlow ////////////////////////
+CV_ENUM(farneFlagType, 0, OPTFLOW_FARNEBACK_GAUSSIAN)
+
+typedef tuple< int > PyrLKOpticalFlowParams;
+typedef TestBaseWithParam<PyrLKOpticalFlowParams> PyrLKOpticalFlowFixture;
+
+OCL_PERF_TEST_P(PyrLKOpticalFlowFixture, PyrLKOpticalFlow,
+                ::testing::Values(1000, 2000, 4000)
+                )
+{
+    Mat frame0 = imread(getDataPath("gpu/opticalflow/rubberwhale1.png"), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty()) << "can't load rubberwhale1.png";
+
+    Mat frame1 = imread(getDataPath("gpu/opticalflow/rubberwhale2.png"), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty()) << "can't load rubberwhale2.png";
+
+    UMat uFrame0; frame0.copyTo(uFrame0);
+    UMat uFrame1; frame1.copyTo(uFrame1);
+
+    const Size winSize = Size(21, 21);
+    const int maxLevel = 3;
+    const TermCriteria criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 0.01);
+    const int flags = 0;
+    const float minEigThreshold = 1e-4f;
+    const double eps = 1.0;
+
+    const PyrLKOpticalFlowParams params = GetParam();
+    const int pointsCount = get<0>(params);
+
+    vector<Point2f> pts, nextPts;
+    vector<unsigned char> status;
+    vector<float> err;
+    goodFeaturesToTrack(frame0, pts, pointsCount, 0.01, 0.0);
+    Mat ptsMat(1, static_cast<int>(pts.size()), CV_32FC2, (void *)&pts[0]);
+
+    declare.in(uFrame0, uFrame1, WARMUP_READ);
+    UMat uNextPts, uStatus, uErr;
+    OCL_TEST_CYCLE()
+        cv::calcOpticalFlowPyrLK(uFrame0, uFrame1, pts, uNextPts, uStatus, uErr, winSize, maxLevel, criteria, flags, minEigThreshold);
+
+    SANITY_CHECK(uNextPts, eps);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@ -43,6 +43,7 @@
 #include <float.h>
 #include <stdio.h>
 #include "lkpyramid.hpp"
+#include "opencl_kernels.hpp"

 #define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))

@ -590,6 +591,262 @@ int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Si
    return maxLevel;
 }

+namespace cv
+{
+    class PyrLKOpticalFlow
+    {
+        struct dim3
+        {
+            unsigned int x, y, z;
+        };
+    public:
+        PyrLKOpticalFlow()
+        {
+            winSize = Size(21, 21);
+            maxLevel = 3;
+            iters = 30;
+            derivLambda = 0.5;
+            useInitialFlow = false;
+        }
+
+        bool checkParam()
+        {
+            iters = std::min(std::max(iters, 0), 100);
+
+            derivLambda = std::min(std::max(derivLambda, 0.0), 1.0);
+            if (derivLambda < 0)
+                return false;
+            if (maxLevel < 0 || winSize.width <= 2 || winSize.height <= 2)
+                return false;
+            calcPatchSize();
+            if (patch.x <= 0 || patch.x >= 6 || patch.y <= 0 || patch.y >= 6)
+                return false;
+            if (!initWaveSize())
+                return false;
+            return true;
+        }
+
+        bool sparse(const UMat &prevImg, const UMat &nextImg, const UMat &prevPts, UMat &nextPts, UMat &status, UMat &err)
+        {
+            if (!checkParam())
+                return false;
+
+            UMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
+            UMat temp2 = nextPts.reshape(1);
+            multiply(1.0f / (1 << maxLevel) /2.0f, temp1, temp2);
+
+            status.setTo(Scalar::all(1));
+
+            // build the image pyramids.
+            std::vector<UMat> prevPyr; prevPyr.resize(maxLevel + 1);
+            std::vector<UMat> nextPyr; nextPyr.resize(maxLevel + 1);
+
+            prevImg.convertTo(prevPyr[0], CV_32F);
+            nextImg.convertTo(nextPyr[0], CV_32F);
+
+            for (int level = 1; level <= maxLevel; ++level)
+            {
+                pyrDown(prevPyr[level - 1], prevPyr[level]);
+                pyrDown(nextPyr[level - 1], nextPyr[level]);
+            }
+
+            // dI/dx ~ Ix, dI/dy ~ Iy
+            for (int level = maxLevel; level >= 0; level--)
+            {
+                if (!lkSparse_run(prevPyr[level], nextPyr[level], prevPts,
+                                  nextPts, status, err,
+                                  prevPts.cols, level))
+                    return false;
+            }
+            return true;
+        }
+
+        Size winSize;
+        int maxLevel;
+        int iters;
+        double derivLambda;
+        bool useInitialFlow;
+
+    private:
+        int waveSize;
+        bool initWaveSize()
+        {
+            waveSize = 1;
+            if (isDeviceCPU())
+                return true;
+
+            ocl::Kernel kernel;
+            if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, ""))
+                return false;
+            waveSize = (int)kernel.preferedWorkGroupSizeMultiple();
+            return true;
+        }
+        dim3 patch;
+        void calcPatchSize()
+        {
+            dim3 block;
+
+            if (winSize.width > 32 && winSize.width > 2 * winSize.height)
+            {
+                block.x = 32;
+                block.y = 8;
+            }
+            else
+            {
+                block.x = 16;
+                block.y = 16;
+            }
+
+            patch.x = (winSize.width  + block.x - 1) / block.x;
+            patch.y = (winSize.height + block.y - 1) / block.y;
+
+            block.z = patch.z = 1;
+        }
+
+        #define SAFE_KERNEL_SET_ARG(idx, arg) \
+        {\
+            int idxNew = kernel.set(idx, arg);\
+            if (-1 == idxNew)\
+            {\
+                printf("lkSparse_run can't setup argument index = %d to kernel\n", idx);\
+                return false;\
+            }\
+            idx = idxNew;\
+        }
+        bool lkSparse_run(UMat &I, UMat &J, const UMat &prevPts, UMat &nextPts, UMat &status, UMat& err,
+            int ptcount, int level)
+        {
+            size_t localThreads[3]  = { 8, 8};
+            size_t globalThreads[3] = { 8 * ptcount, 8};
+            char calcErr = (0 == level) ? 1 : 0;
+
+            cv::String build_options;
+            if (isDeviceCPU())
+                build_options = " -D CPU";
+            else
+                build_options = cv::format("-D WAVE_SIZE=%d", waveSize);
+
+            ocl::Kernel kernel;
+            if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, build_options))
+                return false;
+
+            ocl::Image2D imageI(I);
+            ocl::Image2D imageJ(J);
+            int idxArg = 0;
+#if 0
+            idxArg = kernel.set(idxArg, imageI); //image2d_t I
+            idxArg = kernel.set(idxArg, imageJ); //image2d_t J
+            idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(prevPts)); // __global const float2* prevPts
+            idxArg = kernel.set(idxArg, (int)prevPts.step); // int prevPtsStep
+            idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(nextPts)); // __global const float2* nextPts
+            idxArg = kernel.set(idxArg, (int)nextPts.step); //  int nextPtsStep
+            idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(status)); // __global uchar* status
+            idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(err)); // __global float* err
+            idxArg = kernel.set(idxArg, (int)level); // const int level
+            idxArg = kernel.set(idxArg, (int)I.rows); // const int rows
+            idxArg = kernel.set(idxArg, (int)I.cols); // const int cols
+            idxArg = kernel.set(idxArg, (int)patch.x); // int PATCH_X
+            idxArg = kernel.set(idxArg, (int)patch.y); // int PATCH_Y
+            idxArg = kernel.set(idxArg, (int)winSize.width); // int c_winSize_x
+            idxArg = kernel.set(idxArg, (int)winSize.height); // int c_winSize_y
+            idxArg = kernel.set(idxArg, (int)iters); // int c_iters
+            idxArg = kernel.set(idxArg, (char)calcErr); //char calcErr
+#else
+            SAFE_KERNEL_SET_ARG(idxArg, imageI); //image2d_t I
+            SAFE_KERNEL_SET_ARG(idxArg, imageJ); //image2d_t J
+            SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(prevPts)); // __global const float2* prevPts
+            SAFE_KERNEL_SET_ARG(idxArg, (int)prevPts.step); // int prevPtsStep
+            SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(nextPts)); // __global const float2* nextPts
+            SAFE_KERNEL_SET_ARG(idxArg, (int)nextPts.step); //  int nextPtsStep
+            SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(status)); // __global uchar* status
+            SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(err)); // __global float* err
+            SAFE_KERNEL_SET_ARG(idxArg, (int)level); // const int level
+            SAFE_KERNEL_SET_ARG(idxArg, (int)I.rows); // const int rows
+            SAFE_KERNEL_SET_ARG(idxArg, (int)I.cols); // const int cols
+            SAFE_KERNEL_SET_ARG(idxArg, (int)patch.x); // int PATCH_X
+            SAFE_KERNEL_SET_ARG(idxArg, (int)patch.y); // int PATCH_Y
+            SAFE_KERNEL_SET_ARG(idxArg, (int)winSize.width); // int c_winSize_x
+            SAFE_KERNEL_SET_ARG(idxArg, (int)winSize.height); // int c_winSize_y
+            SAFE_KERNEL_SET_ARG(idxArg, (int)iters); // int c_iters
+            SAFE_KERNEL_SET_ARG(idxArg, (char)calcErr); //char calcErr
+#endif
+
+            return kernel.run(2, globalThreads, localThreads, true);
+        }
+    private:
+        inline static bool isDeviceCPU()
+        {
+            return (cv::ocl::Device::TYPE_CPU == cv::ocl::Device::getDefault().type());
+        }
+    };
+
+
+    static bool ocl_calcOpticalFlowPyrLK(InputArray _prevImg, InputArray _nextImg,
+                                  InputArray _prevPts, InputOutputArray _nextPts,
+                                  OutputArray _status, OutputArray _err,
+                                  Size winSize, int maxLevel,
+                                  TermCriteria criteria,
+                                  int flags/*, double minEigThreshold*/ )
+    {
+        if (0 != (OPTFLOW_LK_GET_MIN_EIGENVALS & flags))
+            return false;
+        if (!cv::ocl::Device::getDefault().imageSupport())
+            return false;
+        if (_nextImg.size() != _prevImg.size())
+            return false;
+        int typePrev = _prevImg.type();
+        int typeNext = _nextImg.type();
+        if ((1 != CV_MAT_CN(typePrev)) || (1 != CV_MAT_CN(typeNext)))
+            return false;
+        if ((0 != CV_MAT_DEPTH(typePrev)) || (0 != CV_MAT_DEPTH(typeNext)))
+            return false;
+
+        if (_prevPts.empty() || _prevPts.type() != CV_32FC2 || (!_prevPts.isContinuous()))
+            return false;
+        if ((1 != _prevPts.size().height) && (1 != _prevPts.size().width))
+            return false;
+        size_t npoints = _prevPts.total();
+        bool useInitialFlow  = (0 != (flags & OPTFLOW_USE_INITIAL_FLOW));
+        if (useInitialFlow)
+        {
+            if (_nextPts.empty() || _nextPts.type() != CV_32FC2 || (!_prevPts.isContinuous()))
+                return false;
+            if ((1 != _nextPts.size().height) && (1 != _nextPts.size().width))
+                return false;
+            if (_nextPts.total() != npoints)
+                return false;
+        }
+        else
+        {
+            _nextPts.create(_prevPts.size(), _prevPts.type());
+        }
+
+        PyrLKOpticalFlow opticalFlow;
+        opticalFlow.winSize     = winSize;
+        opticalFlow.maxLevel    = maxLevel;
+        opticalFlow.iters       = criteria.maxCount;
+        opticalFlow.derivLambda = criteria.epsilon;
+        opticalFlow.useInitialFlow  = useInitialFlow;
+
+        if (!opticalFlow.checkParam())
+            return false;
+
+        UMat umatErr;
+        if (_err.needed())
+        {
+            _err.create((int)npoints, 1, CV_32FC1);
+            umatErr = _err.getUMat();
+        }
+        else
+            umatErr.create((int)npoints, 1, CV_32FC1);
+
+        _status.create((int)npoints, 1, CV_8UC1);
+        UMat umatNextPts = _nextPts.getUMat();
+        UMat umatStatus = _status.getUMat();
+        return opticalFlow.sparse(_prevImg.getUMat(), _nextImg.getUMat(), _prevPts.getUMat(), umatNextPts, umatStatus, umatErr);
+    }
+};
+
 void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
                           InputArray _prevPts, InputOutputArray _nextPts,
                           OutputArray _status, OutputArray _err,
@ -597,6 +854,10 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
                           TermCriteria criteria,
                           int flags, double minEigThreshold )
 {
+    bool use_opencl = ocl::useOpenCL() && (_prevImg.isUMat() || _nextImg.isUMat());
+    if ( use_opencl && ocl_calcOpticalFlowPyrLK(_prevImg, _nextImg, _prevPts, _nextPts, _status, _err, winSize, maxLevel, criteria, flags/*, minEigThreshold*/))
+        return;
+
    Mat prevPtsMat = _prevPts.getMat();
    const int derivDepth = DataType<cv::detail::deriv_type>::depth;

--- a/modules/video/src/opencl/pyrlk.cl
+++ b/modules/video/src/opencl/pyrlk.cl
@ -0,0 +1,583 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Dachuan Zhao, dachuan@multicorewareinc.com
+//    Yao Wang, bitwangyaoyao@gmail.com
+//    Xiaopeng Fu, fuxiaopeng2222@163.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define	BUFFER	64
+#define	BUFFER2	BUFFER>>1
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
+#ifdef CPU
+
+inline void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local float* smem2,  __local float* smem3, int tid)
+{
+    smem1[tid] = val1;
+    smem2[tid] = val2;
+    smem3[tid] = val3;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for(int i = BUFFER2; i > 0; i >>= 1)
+    {
+        if(tid < i)
+        {
+            smem1[tid] += smem1[tid + i];
+            smem2[tid] += smem2[tid + i];
+            smem3[tid] += smem3[tid + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+
+inline void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)
+{
+    smem1[tid] = val1;
+    smem2[tid] = val2;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for(int i = BUFFER2; i > 0; i >>= 1)
+    {
+        if(tid < i)
+        {
+            smem1[tid] += smem1[tid + i];
+            smem2[tid] += smem2[tid + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+
+inline void reduce1(float val1, volatile __local float* smem1, int tid)
+{
+    smem1[tid] = val1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for(int i = BUFFER2; i > 0; i >>= 1)
+    {
+        if(tid < i)
+        {
+            smem1[tid] += smem1[tid + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+#else
+inline void reduce3(float val1, float val2, float val3,
+             __local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
+{
+    smem1[tid] = val1;
+    smem2[tid] = val2;
+    smem3[tid] = val3;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
+        smem3[tid] += smem3[tid + 32];
+#if WAVE_SIZE < 32
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16)
+    {
+#endif
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
+        smem3[tid] += smem3[tid + 16];
+#if WAVE_SIZE <16
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
+        smem3[tid] += smem3[tid + 8];
+
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
+        smem3[tid] += smem3[tid + 4];
+
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
+        smem3[tid] += smem3[tid + 2];
+
+        smem1[tid] += smem1[tid + 1];
+        smem2[tid] += smem2[tid + 1];
+        smem3[tid] += smem3[tid + 1];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+inline void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
+{
+    smem1[tid] = val1;
+    smem2[tid] = val2;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
+#if WAVE_SIZE < 32
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16)
+    {
+#endif
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
+#if WAVE_SIZE <16
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
+
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
+
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
+
+        smem1[tid] += smem1[tid + 1];
+        smem2[tid] += smem2[tid + 1];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+inline void reduce1(float val1, __local volatile float* smem1, int tid)
+{
+    smem1[tid] = val1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        smem1[tid] += smem1[tid + 32];
+#if WAVE_SIZE < 32
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16)
+    {
+#endif
+        smem1[tid] += smem1[tid + 16];
+#if WAVE_SIZE <16
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem1[tid] += smem1[tid + 4];
+        smem1[tid] += smem1[tid + 2];
+        smem1[tid] += smem1[tid + 1];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+#endif
+
+#define SCALE (1.0f / (1 << 20))
+#define	THRESHOLD	0.01f
+
+// Image read mode
+__constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
+
+inline void SetPatch(image2d_t I, float x, float y,
+              float* Pch, float* Dx, float* Dy,
+              float* A11, float* A12, float* A22)
+{
+    *Pch = read_imagef(I, sampler, (float2)(x, y)).x;
+
+    float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
+                 (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);
+
+    float dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
+                 (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x);
+
+
+    *Dx = dIdx;
+    *Dy = dIdy;
+
+    *A11 += dIdx * dIdx;
+    *A12 += dIdx * dIdy;
+    *A22 += dIdy * dIdy;
+}
+
+inline void GetPatch(image2d_t J, float x, float y,
+              float* Pch, float* Dx, float* Dy,
+              float* b1, float* b2)
+{
+    float J_val = read_imagef(J, sampler, (float2)(x, y)).x;
+    float diff = (J_val - *Pch) * 32.0f;
+    *b1 += diff**Dx;
+    *b2 += diff**Dy;
+}
+
+inline void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval)
+{
+    float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch;
+    *errval += fabs(diff);
+}
+
+inline void SetPatch4(image2d_t I, const float x, const float y,
+               float4* Pch, float4* Dx, float4* Dy,
+               float* A11, float* A12, float* A22)
+{
+    *Pch = read_imagef(I, sampler, (float2)(x, y));
+
+    float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) -
+                  (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)));
+
+    float4 dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) -
+                  (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)));
+
+
+    *Dx = dIdx;
+    *Dy = dIdy;
+    float4 sqIdx = dIdx * dIdx;
+    *A11 += sqIdx.x + sqIdx.y + sqIdx.z;
+    sqIdx = dIdx * dIdy;
+    *A12 += sqIdx.x + sqIdx.y + sqIdx.z;
+    sqIdx = dIdy * dIdy;
+    *A22 += sqIdx.x + sqIdx.y + sqIdx.z;
+}
+
+inline void GetPatch4(image2d_t J, const float x, const float y,
+               const float4* Pch, const float4* Dx, const float4* Dy,
+               float* b1, float* b2)
+{
+    float4 J_val = read_imagef(J, sampler, (float2)(x, y));
+    float4 diff = (J_val - *Pch) * 32.0f;
+    float4 xdiff = diff* *Dx;
+    *b1 += xdiff.x + xdiff.y + xdiff.z;
+    xdiff = diff* *Dy;
+    *b2 += xdiff.x + xdiff.y + xdiff.z;
+}
+
+inline void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval)
+{
+    float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch;
+    *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
+}
+
+#define	GRIDSIZE	3
+__kernel void lkSparse(image2d_t I, image2d_t J,
+                       __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
+                       const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
+{
+    __local float smem1[BUFFER];
+    __local float smem2[BUFFER];
+    __local float smem3[BUFFER];
+
+    unsigned int xid=get_local_id(0);
+    unsigned int yid=get_local_id(1);
+    unsigned int gid=get_group_id(0);
+    unsigned int xsize=get_local_size(0);
+    unsigned int ysize=get_local_size(1);
+    int xBase, yBase, k;
+
+    float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
+
+    const int tid = mad24(yid, xsize, xid);
+
+    float2 prevPt = prevPts[gid] / (float2)(1 << level);
+
+    if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
+    {
+        if (tid == 0 && level == 0)
+        {
+            status[gid] = 0;
+        }
+
+        return;
+    }
+    prevPt -= c_halfWin;
+
+    // extract the patch from the first image, compute covariation matrix of derivatives
+
+    float A11 = 0;
+    float A12 = 0;
+    float A22 = 0;
+
+    float I_patch[GRIDSIZE][GRIDSIZE];
+    float dIdx_patch[GRIDSIZE][GRIDSIZE];
+    float dIdy_patch[GRIDSIZE][GRIDSIZE];
+
+    yBase=yid;
+    {
+        xBase=xid;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],
+                 &A11, &A12, &A22);
+
+
+        xBase+=xsize;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],
+                 &A11, &A12, &A22);
+
+        xBase+=xsize;
+        if(xBase<c_winSize_x)
+            SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
+                     &A11, &A12, &A22);
+    }
+    yBase+=ysize;
+    {
+        xBase=xid;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],
+                 &A11, &A12, &A22);
+
+
+        xBase+=xsize;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],
+                 &A11, &A12, &A22);
+
+        xBase+=xsize;
+        if(xBase<c_winSize_x)
+            SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
+                     &A11, &A12, &A22);
+    }
+    yBase+=ysize;
+    if(yBase<c_winSize_y)
+    {
+        xBase=xid;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],
+                 &A11, &A12, &A22);
+
+
+        xBase+=xsize;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],
+                 &A11, &A12, &A22);
+
+        xBase+=xsize;
+        if(xBase<c_winSize_x)
+            SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
+                     &A11, &A12, &A22);
+    }
+
+    reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
+
+    A11 = smem1[0];
+    A12 = smem2[0];
+    A22 = smem3[0];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    float D = A11 * A22 - A12 * A12;
+
+    if (D < 1.192092896e-07f)
+    {
+        if (tid == 0 && level == 0)
+            status[gid] = 0;
+
+        return;
+    }
+
+    A11 /= D;
+    A12 /= D;
+    A22 /= D;
+
+    prevPt = nextPts[gid] * 2.0f - c_halfWin;
+
+    for (k = 0; k < c_iters; ++k)
+    {
+        if (prevPt.x < -c_halfWin.x || prevPt.x >= cols || prevPt.y < -c_halfWin.y || prevPt.y >= rows)
+        {
+            if (tid == 0 && level == 0)
+                status[gid] = 0;
+            return;
+        }
+
+        float b1 = 0;
+        float b2 = 0;
+
+        yBase=yid;
+        {
+            xBase=xid;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],
+                     &b1, &b2);
+
+
+            xBase+=xsize;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],
+                     &b1, &b2);
+
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
+                         &b1, &b2);
+        }
+        yBase+=ysize;
+        {
+            xBase=xid;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],
+                     &b1, &b2);
+
+
+            xBase+=xsize;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],
+                     &b1, &b2);
+
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
+                         &b1, &b2);
+        }
+        yBase+=ysize;
+        if(yBase<c_winSize_y)
+        {
+            xBase=xid;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],
+                     &b1, &b2);
+
+
+            xBase+=xsize;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],
+                     &b1, &b2);
+
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
+                         &b1, &b2);
+        }
+
+        reduce2(b1, b2, smem1, smem2, tid);
+
+        b1 = smem1[0];
+        b2 = smem2[0];
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        float2 delta;
+        delta.x = A12 * b2 - A22 * b1;
+        delta.y = A12 * b1 - A11 * b2;
+
+        prevPt += delta;
+
+        if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD)
+            break;
+    }
+
+    D = 0.0f;
+    if (calcErr)
+    {
+        yBase=yid;
+        {
+            xBase=xid;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][0], &D);
+
+
+            xBase+=xsize;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][1], &D);
+
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[0][2], &D);
+        }
+        yBase+=ysize;
+        {
+            xBase=xid;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][0], &D);
+
+
+            xBase+=xsize;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][1], &D);
+
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[1][2], &D);
+        }
+        yBase+=ysize;
+        if(yBase<c_winSize_y)
+        {
+            xBase=xid;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][0], &D);
+
+
+            xBase+=xsize;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][1], &D);
+
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[2][2], &D);
+        }
+
+        reduce1(D, smem1, tid);
+    }
+
+    if (tid == 0)
+    {
+        prevPt += c_halfWin;
+
+        nextPts[gid] = prevPt;
+
+        if (calcErr)
+            err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y);
+    }
+}
--- a/modules/video/test/ocl/test_optflowpyrlk.cpp
+++ b/modules/video/test/ocl/test_optflowpyrlk.cpp
@ -0,0 +1,143 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+
+#ifdef HAVE_OPENCL
+
+
+namespace cvtest {
+namespace ocl {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// PyrLKOpticalFlow
+
+PARAM_TEST_CASE(PyrLKOpticalFlow, int, int)
+{
+    Size winSize;
+    int maxLevel;
+    TermCriteria criteria;
+    int flags;
+    double minEigThreshold;
+
+    virtual void SetUp()
+    {
+        winSize = Size(GET_PARAM(0), GET_PARAM(0));
+        maxLevel = GET_PARAM(1);
+        criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 0.01);
+        flags = 0;
+        minEigThreshold = 1e-4f;
+    }
+};
+
+OCL_TEST_P(PyrLKOpticalFlow, Mat)
+{
+    static const int npoints = 1000;
+    static const float eps = 0.03f;
+
+    cv::Mat frame0 = readImage("optflow/RubberWhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+    UMat umatFrame0; frame0.copyTo(umatFrame0);
+
+    cv::Mat frame1 = readImage("optflow/RubberWhale2.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+    UMat umatFrame1; frame1.copyTo(umatFrame1);
+
+    std::vector<cv::Point2f> pts;
+    cv::goodFeaturesToTrack(frame0, pts, npoints, 0.01, 0.0);
+
+    std::vector<cv::Point2f> cpuNextPts;
+    std::vector<unsigned char> cpuStatusCPU;
+    std::vector<float> cpuErr;
+    OCL_OFF(cv::calcOpticalFlowPyrLK(frame0, frame1, pts, cpuNextPts, cpuStatusCPU, cpuErr, winSize, maxLevel, criteria, flags, minEigThreshold));
+
+    UMat umatNextPts, umatStatus, umatErr;
+    OCL_ON(cv::calcOpticalFlowPyrLK(umatFrame0, umatFrame1, pts, umatNextPts, umatStatus, umatErr, winSize, maxLevel, criteria, flags, minEigThreshold));
+    std::vector<cv::Point2f> nextPts; umatNextPts.reshape(2, 1).copyTo(nextPts);
+    std::vector<unsigned char> status; umatStatus.reshape(1, 1).copyTo(status);
+    std::vector<float> err; umatErr.reshape(1, 1).copyTo(err);
+
+    ASSERT_EQ(cpuNextPts.size(), nextPts.size());
+    ASSERT_EQ(cpuStatusCPU.size(), status.size());
+
+    size_t mistmatch = 0;
+    for (size_t i = 0; i < nextPts.size(); ++i)
+    {
+        if (status[i] != cpuStatusCPU[i])
+        {
+            ++mistmatch;
+            continue;
+        }
+
+        if (status[i])
+        {
+            cv::Point2i a = nextPts[i];
+            cv::Point2i b = cpuNextPts[i];
+
+            bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1;
+            float errdiff = 0.0f;
+
+            if (!eq || errdiff > 1e-1)
+                ++mistmatch;
+        }
+    }
+
+    double bad_ratio = static_cast<double>(mistmatch) / (nextPts.size());
+
+    ASSERT_LE(bad_ratio, eps);
+}
+
+OCL_INSTANTIATE_TEST_CASE_P(Video, PyrLKOpticalFlow,
+                            Combine(
+                                Values(21, 25),
+                                Values(3, 5)
+                                )
+                           );
+
+} } // namespace cvtest::ocl
+
+
+#endif // HAVE_OPENCL