diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp
index 12c7b8b06..850a2e60e 100644
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -60,6 +60,7 @@ class CV_EXPORTS Program;
 class CV_EXPORTS ProgramSource2;
 class CV_EXPORTS Queue;
 class CV_EXPORTS PlatformInfo2;
+class CV_EXPORTS Image2D;
 
 class CV_EXPORTS Device
 {
@@ -89,6 +90,7 @@ public:
     String vendor() const;
     String OpenCL_C_Version() const;
     String OpenCLVersion() const;
+    String deviceVersion() const;
     String driverVersion() const;
     void* ptr() const;
 
@@ -325,6 +327,7 @@ public:
                 const String& buildopts, String* errmsg=0);
 
     int set(int i, const void* value, size_t sz);
+    int set(int i, const Image2D& image2D);
     int set(int i, const UMat& m);
     int set(int i, const KernelArg& arg);
     template<typename _Tp> int set(int i, const _Tp& value)
@@ -574,6 +577,19 @@ CV_EXPORTS const char* typeToStr(int t);
 CV_EXPORTS const char* memopTypeToStr(int t);
 CV_EXPORTS void getPlatfomsInfo(std::vector<PlatformInfo2>& platform_info);
 
+class CV_EXPORTS Image2D
+{
+public:
+    Image2D();
+    Image2D(const UMat &src);
+    ~Image2D();
+
+    void* ptr() const;
+protected:
+    struct Impl;
+    Impl* p;
+};
+
 }}
 
 #endif
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 73980629a..f103cd99d 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -1705,6 +1705,9 @@ String Device::OpenCL_C_Version() const
 String Device::OpenCLVersion() const
 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
 
+String Device::deviceVersion() const
+{ return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
+
 String Device::driverVersion() const
 { return p ? p->getStrProp(CL_DRIVER_VERSION) : String(); }
 
@@ -2689,6 +2692,12 @@ int Kernel::set(int i, const void* value, size_t sz)
     return i+1;
 }
 
+int Kernel::set(int i, const Image2D& image2D)
+{
+    cl_mem h = (cl_mem)image2D.ptr();
+    return set(i, &h, sizeof(h));
+}
+
 int Kernel::set(int i, const UMat& m)
 {
     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
@@ -3785,4 +3794,149 @@ const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
     return buf;
 }
 
+///////////////////////////////////////////////////////////////////////////////////////////////
+// deviceVersion has format
+//   OpenCL<space><major_version.minor_version><space><vendor-specific information>
+// by specification
+//   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
+//   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
+static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
+{
+    major = minor = 0;
+    if (10 >= deviceVersion.length())
+        return;
+    const char *pstr = deviceVersion.c_str();
+    if (0 != strncmp(pstr, "OpenCL ", 7))
+        return;
+    size_t ppos = deviceVersion.find('.', 7);
+    if (String::npos == ppos)
+        return;
+    String temp = deviceVersion.substr(7, ppos - 7);
+    major = atoi(temp.c_str());
+    temp = deviceVersion.substr(ppos + 1);
+    minor = atoi(temp.c_str());
+}
+
+struct Image2D::Impl
+{
+    Impl(const UMat &src)
+    {
+        init(src);
+    }
+    ~Impl()
+    {
+        if (handle)
+            clReleaseMemObject(handle);
+    }
+    void init(const UMat &src)
+    {
+        cl_image_format format;
+        int err;
+        int depth    = src.depth();
+        int channels = src.channels();
+
+        switch(depth)
+        {
+        case CV_8U:
+            format.image_channel_data_type = CL_UNSIGNED_INT8;
+            break;
+        case CV_32S:
+            format.image_channel_data_type = CL_UNSIGNED_INT32;
+            break;
+        case CV_32F:
+            format.image_channel_data_type = CL_FLOAT;
+            break;
+        default:
+            CV_Error(-1, "Image forma is not supported");
+            break;
+        }
+        switch(channels)
+        {
+        case 1:
+            format.image_channel_order     = CL_R;
+            break;
+        case 3:
+            format.image_channel_order     = CL_RGB;
+            break;
+        case 4:
+            format.image_channel_order     = CL_RGBA;
+            break;
+        default:
+            CV_Error(-1, "Image format is not supported");
+            break;
+        }
+#ifdef CL_VERSION_1_2
+        //this enables backwards portability to
+        //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
+        int minor, major;
+        parseDeviceVersion(Device::getDefault().deviceVersion(), major, minor);
+        if ((1 < major) || ((1 == major) && (2 <= minor)))
+        {
+            cl_image_desc desc;
+            desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
+            desc.image_width      = src.cols;
+            desc.image_height     = src.rows;
+            desc.image_depth      = 0;
+            desc.image_array_size = 1;
+            desc.image_row_pitch  = 0;
+            desc.image_slice_pitch = 0;
+            desc.buffer           = NULL;
+            desc.num_mip_levels   = 0;
+            desc.num_samples      = 0;
+            handle = clCreateImage((cl_context)Context2::getDefault().ptr(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
+        }
+        else
+#endif
+        {
+            handle = clCreateImage2D((cl_context)Context2::getDefault().ptr(), CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
+        }
+        size_t origin[] = { 0, 0, 0 };
+        size_t region[] = { src.cols, src.rows, 1 };
+
+        cl_mem devData;
+        if (!src.isContinuous())
+        {
+            devData = clCreateBuffer((cl_context)Context2::getDefault().ptr(), CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, NULL);
+            const size_t roi[3] = {src.cols * src.elemSize(), src.rows, 1};
+            clEnqueueCopyBufferRect((cl_command_queue)Queue::getDefault().ptr(), (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
+                roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL);
+            clFlush((cl_command_queue)Queue::getDefault().ptr());
+        }
+        else
+        {
+            devData = (cl_mem)src.handle(ACCESS_READ);
+        }
+
+        clEnqueueCopyBufferToImage((cl_command_queue)Queue::getDefault().ptr(), devData, handle, 0, origin, region, 0, NULL, 0);
+        if (!src.isContinuous())
+        {
+            clFlush((cl_command_queue)Queue::getDefault().ptr());
+            clReleaseMemObject(devData);
+        }
+    }
+
+    IMPLEMENT_REFCOUNTABLE();
+
+    cl_mem handle;
+};
+
+Image2D::Image2D()
+{
+    p = NULL;
+}
+Image2D::Image2D(const UMat &src)
+{
+    p = new Impl(src);
+}
+Image2D::~Image2D()
+{
+    if (p)
+        p->release();
+}
+
+void* Image2D::ptr() const
+{
+    return p ? p->handle : 0;
+}
+
 }}
diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index 155737ba7..2cab3237c 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -43,6 +43,7 @@
 #include <float.h>
 #include <stdio.h>
 #include "lkpyramid.hpp"
+#include "opencl_kernels.hpp"
 
 #define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))
 
@@ -590,6 +591,231 @@ int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Si
     return maxLevel;
 }
 
+namespace cv
+{
+    class PyrLKOpticalFlow
+    {
+        struct dim3
+        {
+            unsigned int x, y, z;
+        };
+    public:
+        PyrLKOpticalFlow()
+        {
+            winSize = Size(21, 21);
+            maxLevel = 3;
+            iters = 30;
+            derivLambda = 0.5;
+            useInitialFlow = false;
+            //minEigThreshold = 1e-4f;
+            //getMinEigenVals = false;
+        }
+
+        bool sparse(const UMat &prevImg, const UMat &nextImg, const UMat &prevPts, UMat &nextPts, UMat &status, UMat &err)
+        {
+            if (prevPts.empty())
+            {
+                nextPts.release();
+                status.release();
+                return false;
+            }
+
+            derivLambda = std::min(std::max(derivLambda, 0.0), 1.0);
+            if (derivLambda < 0)
+                return false;
+            if (maxLevel < 0 || winSize.width <= 2 || winSize.height <= 2)
+                return false;
+            iters = std::min(std::max(iters, 0), 100);
+            if (prevPts.rows != 1 || prevPts.type() != CV_32FC2)
+                return false;
+
+            dim3 patch;
+            calcPatchSize(patch);
+            if (patch.x <= 0 || patch.x >= 6 || patch.y <= 0 || patch.y >= 6)
+                return false;
+            if (!initWaveSize())
+                return false;
+            if (useInitialFlow)
+            {
+                if (nextPts.size() != prevPts.size() || nextPts.type() != CV_32FC2)
+                    return false;
+            }
+            else
+                ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
+
+            UMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
+            UMat temp2 = nextPts.reshape(1);
+            multiply(1.0f / (1 << maxLevel) /2.0f, temp1, temp2);
+
+            ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
+            status.setTo(Scalar::all(1));
+
+            ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, err);
+
+            // build the image pyramids.
+            std::vector<UMat> prevPyr; prevPyr.resize(maxLevel + 1);
+            std::vector<UMat> nextPyr; nextPyr.resize(maxLevel + 1);
+
+            prevImg.convertTo(prevPyr[0], CV_32F);
+            nextImg.convertTo(nextPyr[0], CV_32F);
+
+            for (int level = 1; level <= maxLevel; ++level)
+            {
+                pyrDown(prevPyr[level - 1], prevPyr[level]);
+                pyrDown(nextPyr[level - 1], nextPyr[level]);
+            }
+
+            // dI/dx ~ Ix, dI/dy ~ Iy
+            for (int level = maxLevel; level >= 0; level--)
+            {
+                lkSparse_run(prevPyr[level], nextPyr[level],
+                             prevPts, nextPts, status, err, prevPts.cols,
+                             level, patch);
+            }
+            return true;
+        }
+
+        Size winSize;
+        int maxLevel;
+        int iters;
+        double derivLambda;
+        bool useInitialFlow;
+        //float minEigThreshold;
+        //bool getMinEigenVals;
+
+    private:
+        void calcPatchSize(dim3 &patch)
+        {
+            dim3 block;
+            //winSize.width *= cn;
+
+            if (winSize.width > 32 && winSize.width > 2 * winSize.height)
+            {
+                block.x = 32;
+                block.y = 8;
+            }
+            else
+            {
+                block.x = 16;
+                block.y = 16;
+            }
+
+            patch.x = (winSize.width  + block.x - 1) / block.x;
+            patch.y = (winSize.height + block.y - 1) / block.y;
+
+            block.z = patch.z = 1;
+        }
+    private:
+        int waveSize;
+        bool initWaveSize()
+        {
+            waveSize = 1;
+            if (isDeviceCPU())
+                return true;
+
+            ocl::Kernel kernel;
+            if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, ""))
+                return false;
+            waveSize = (int)kernel.preferedWorkGroupSizeMultiple();
+            return true;
+        }
+        bool lkSparse_run(UMat &I, UMat &J, const UMat &prevPts, UMat &nextPts, UMat &status, UMat& err,
+            int ptcount, int level, dim3 patch)
+        {
+            size_t localThreads[3]  = { 8, 8};
+            size_t globalThreads[3] = { 8 * ptcount, 8};
+            char calcErr = (0 == level) ? 1 : 0;
+
+            cv::String build_options;
+            if (isDeviceCPU())
+                build_options = " -D CPU";
+            else
+                build_options = cv::format("-D WAVE_SIZE=%d", waveSize);
+
+            ocl::Kernel kernel;
+            if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, build_options))
+                return false;
+
+            ocl::Image2D imageI(I);
+            ocl::Image2D imageJ(J);
+            int idxArg = 0;
+            idxArg = kernel.set(idxArg, imageI); //image2d_t I
+            idxArg = kernel.set(idxArg, imageJ); //image2d_t J
+            idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(prevPts)); // __global const float2* prevPts
+            idxArg = kernel.set(idxArg, (int)prevPts.step); // int prevPtsStep
+            idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(nextPts)); // __global const float2* nextPts
+            idxArg = kernel.set(idxArg, (int)nextPts.step); //  int nextPtsStep
+            idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(status)); // __global uchar* status
+            idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(err)); // __global float* err
+            idxArg = kernel.set(idxArg, (int)level); // const int level
+            idxArg = kernel.set(idxArg, (int)I.rows); // const int rows
+            idxArg = kernel.set(idxArg, (int)I.cols); // const int cols
+            idxArg = kernel.set(idxArg, (int)patch.x); // int PATCH_X
+            idxArg = kernel.set(idxArg, (int)patch.y); // int PATCH_Y
+            idxArg = kernel.set(idxArg, (int)winSize.width); // int c_winSize_x
+            idxArg = kernel.set(idxArg, (int)winSize.height); // int c_winSize_y
+            idxArg = kernel.set(idxArg, (int)iters); // int c_iters
+            idxArg = kernel.set(idxArg, (char)calcErr); //char calcErr
+
+            return kernel.run(2, globalThreads, localThreads, true);
+        }
+    private:
+        inline static bool isDeviceCPU()
+        {
+            return (cv::ocl::Device::TYPE_CPU == cv::ocl::Device::getDefault().type());
+        }
+        inline static void ensureSizeIsEnough(int rows, int cols, int type, UMat &m)
+        {
+            if (m.type() == type && m.rows >= rows && m.cols >= cols)
+                m = m(Rect(0, 0, cols, rows));
+            else
+                m.create(rows, cols, type);
+        }
+    };
+
+
+    bool ocl_calcOpticalFlowPyrLK(InputArray _prevImg, InputArray _nextImg,
+                                  InputArray _prevPts, InputOutputArray _nextPts,
+                                  OutputArray _status, OutputArray _err,
+                                  Size winSize, int maxLevel,
+                                  TermCriteria criteria,
+                                  int flags/*, double minEigThreshold*/ )
+    {
+        if (0 != (OPTFLOW_LK_GET_MIN_EIGENVALS & flags))
+            return false;
+        if (!cv::ocl::Device::getDefault().imageSupport())
+            return false;
+        if (_nextImg.size() != _prevImg.size())
+            return false;
+        int typePrev = _prevImg.type();
+        int typeNext = _nextImg.type();
+        if ((1 != CV_MAT_CN(typePrev)) || (1 != CV_MAT_CN(typeNext)))
+            return false;
+        if ((0 != CV_MAT_DEPTH(typePrev)) || (0 != CV_MAT_DEPTH(typeNext)))
+            return false;
+
+        PyrLKOpticalFlow opticalFlow;
+        opticalFlow.winSize     = winSize;
+        opticalFlow.maxLevel    = maxLevel;
+        opticalFlow.iters       = criteria.maxCount;
+        opticalFlow.derivLambda = criteria.epsilon;
+        opticalFlow.useInitialFlow  = (0 != (flags & OPTFLOW_USE_INITIAL_FLOW));
+
+        UMat umatErr;
+        if (_err.needed())
+        {
+            _err.create(_prevPts.size(), CV_8UC1);
+            umatErr = _err.getUMat();
+        }
+
+        _nextPts.create(_prevPts.size(), _prevPts.type());
+        _status.create(_prevPts.size(), CV_8UC1);
+        UMat umatNextPts = _nextPts.getUMat();
+        UMat umatStatus = _status.getUMat();
+        return opticalFlow.sparse(_prevImg.getUMat(), _nextImg.getUMat(), _prevPts.getUMat(), umatNextPts, umatStatus, umatErr);
+    }
+};
+
 void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
                            InputArray _prevPts, InputOutputArray _nextPts,
                            OutputArray _status, OutputArray _err,
@@ -597,6 +823,10 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
                            TermCriteria criteria,
                            int flags, double minEigThreshold )
 {
+    bool use_opencl = ocl::useOpenCL() && (_prevImg.isUMat() || _nextImg.isUMat());
+    if ( use_opencl && ocl_calcOpticalFlowPyrLK(_prevImg, _nextImg, _prevPts, _nextPts, _status, _err, winSize, maxLevel, criteria, flags/*, minEigThreshold*/))
+        return;
+
     Mat prevPtsMat = _prevPts.getMat();
     const int derivDepth = DataType<cv::detail::deriv_type>::depth;
 
diff --git a/modules/video/src/opencl/pyrlk.cl b/modules/video/src/opencl/pyrlk.cl
new file mode 100644
index 000000000..c01855490
--- /dev/null
+++ b/modules/video/src/opencl/pyrlk.cl
@@ -0,0 +1,583 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Dachuan Zhao, dachuan@multicorewareinc.com
+//    Yao Wang, bitwangyaoyao@gmail.com
+//    Xiaopeng Fu, fuxiaopeng2222@163.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define	BUFFER	64
+#define	BUFFER2	BUFFER>>1
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
+#ifdef CPU
+
+inline void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local float* smem2,  __local float* smem3, int tid)
+{
+    smem1[tid] = val1;
+    smem2[tid] = val2;
+    smem3[tid] = val3;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for(int i = BUFFER2; i > 0; i >>= 1)
+    {
+        if(tid < i)
+        {
+            smem1[tid] += smem1[tid + i];
+            smem2[tid] += smem2[tid + i];
+            smem3[tid] += smem3[tid + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+
+inline void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)
+{
+    smem1[tid] = val1;
+    smem2[tid] = val2;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for(int i = BUFFER2; i > 0; i >>= 1)
+    {
+        if(tid < i)
+        {
+            smem1[tid] += smem1[tid + i];
+            smem2[tid] += smem2[tid + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+
+inline void reduce1(float val1, volatile __local float* smem1, int tid)
+{
+    smem1[tid] = val1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for(int i = BUFFER2; i > 0; i >>= 1)
+    {
+        if(tid < i)
+        {
+            smem1[tid] += smem1[tid + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+#else
+inline void reduce3(float val1, float val2, float val3,
+             __local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
+{
+    smem1[tid] = val1;
+    smem2[tid] = val2;
+    smem3[tid] = val3;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
+        smem3[tid] += smem3[tid + 32];
+#if WAVE_SIZE < 32
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16)
+    {
+#endif
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
+        smem3[tid] += smem3[tid + 16];
+#if WAVE_SIZE <16
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
+        smem3[tid] += smem3[tid + 8];
+
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
+        smem3[tid] += smem3[tid + 4];
+
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
+        smem3[tid] += smem3[tid + 2];
+
+        smem1[tid] += smem1[tid + 1];
+        smem2[tid] += smem2[tid + 1];
+        smem3[tid] += smem3[tid + 1];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+inline void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
+{
+    smem1[tid] = val1;
+    smem2[tid] = val2;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
+#if WAVE_SIZE < 32
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16)
+    {
+#endif
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
+#if WAVE_SIZE <16
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
+
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
+
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
+
+        smem1[tid] += smem1[tid + 1];
+        smem2[tid] += smem2[tid + 1];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+inline void reduce1(float val1, __local volatile float* smem1, int tid)
+{
+    smem1[tid] = val1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        smem1[tid] += smem1[tid + 32];
+#if WAVE_SIZE < 32
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16)
+    {
+#endif
+        smem1[tid] += smem1[tid + 16];
+#if WAVE_SIZE <16
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem1[tid] += smem1[tid + 4];
+        smem1[tid] += smem1[tid + 2];
+        smem1[tid] += smem1[tid + 1];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+#endif
+
+#define SCALE (1.0f / (1 << 20))
+#define	THRESHOLD	0.01f
+
+// Image read mode
+__constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
+
+inline void SetPatch(image2d_t I, float x, float y,
+              float* Pch, float* Dx, float* Dy,
+              float* A11, float* A12, float* A22)
+{
+    *Pch = read_imagef(I, sampler, (float2)(x, y)).x;
+
+    float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
+                 (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);
+
+    float dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
+                 (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x);
+
+
+    *Dx = dIdx;
+    *Dy = dIdy;
+
+    *A11 += dIdx * dIdx;
+    *A12 += dIdx * dIdy;
+    *A22 += dIdy * dIdy;
+}
+
+inline void GetPatch(image2d_t J, float x, float y,
+              float* Pch, float* Dx, float* Dy,
+              float* b1, float* b2)
+{
+    float J_val = read_imagef(J, sampler, (float2)(x, y)).x;
+    float diff = (J_val - *Pch) * 32.0f;
+    *b1 += diff**Dx;
+    *b2 += diff**Dy;
+}
+
+inline void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval)
+{
+    float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch;
+    *errval += fabs(diff);
+}
+
+inline void SetPatch4(image2d_t I, const float x, const float y,
+               float4* Pch, float4* Dx, float4* Dy,
+               float* A11, float* A12, float* A22)
+{
+    *Pch = read_imagef(I, sampler, (float2)(x, y));
+
+    float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) -
+                  (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)));
+
+    float4 dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) -
+                  (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)));
+
+
+    *Dx = dIdx;
+    *Dy = dIdy;
+    float4 sqIdx = dIdx * dIdx;
+    *A11 += sqIdx.x + sqIdx.y + sqIdx.z;
+    sqIdx = dIdx * dIdy;
+    *A12 += sqIdx.x + sqIdx.y + sqIdx.z;
+    sqIdx = dIdy * dIdy;
+    *A22 += sqIdx.x + sqIdx.y + sqIdx.z;
+}
+
+inline void GetPatch4(image2d_t J, const float x, const float y,
+               const float4* Pch, const float4* Dx, const float4* Dy,
+               float* b1, float* b2)
+{
+    float4 J_val = read_imagef(J, sampler, (float2)(x, y));
+    float4 diff = (J_val - *Pch) * 32.0f;
+    float4 xdiff = diff* *Dx;
+    *b1 += xdiff.x + xdiff.y + xdiff.z;
+    xdiff = diff* *Dy;
+    *b2 += xdiff.x + xdiff.y + xdiff.z;
+}
+
+inline void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval)
+{
+    float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch;
+    *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
+}
+
+#define	GRIDSIZE	3
+__kernel void lkSparse(image2d_t I, image2d_t J,
+                       __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
+                       const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
+{
+    __local float smem1[BUFFER];
+    __local float smem2[BUFFER];
+    __local float smem3[BUFFER];
+
+    unsigned int xid=get_local_id(0);
+    unsigned int yid=get_local_id(1);
+    unsigned int gid=get_group_id(0);
+    unsigned int xsize=get_local_size(0);
+    unsigned int ysize=get_local_size(1);
+    int xBase, yBase, k;
+
+    float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
+
+    const int tid = mad24(yid, xsize, xid);
+
+    float2 prevPt = prevPts[gid] / (float2)(1 << level);
+
+    if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
+    {
+        if (tid == 0 && level == 0)
+        {
+            status[gid] = 0;
+        }
+
+        return;
+    }
+    prevPt -= c_halfWin;
+
+    // extract the patch from the first image, compute covariation matrix of derivatives
+
+    float A11 = 0;
+    float A12 = 0;
+    float A22 = 0;
+
+    float I_patch[GRIDSIZE][GRIDSIZE];
+    float dIdx_patch[GRIDSIZE][GRIDSIZE];
+    float dIdy_patch[GRIDSIZE][GRIDSIZE];
+
+    yBase=yid;
+    {
+        xBase=xid;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],
+                 &A11, &A12, &A22);
+
+
+        xBase+=xsize;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],
+                 &A11, &A12, &A22);
+
+        xBase+=xsize;
+        if(xBase<c_winSize_x)
+            SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
+                     &A11, &A12, &A22);
+    }
+    yBase+=ysize;
+    {
+        xBase=xid;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],
+                 &A11, &A12, &A22);
+
+
+        xBase+=xsize;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],
+                 &A11, &A12, &A22);
+
+        xBase+=xsize;
+        if(xBase<c_winSize_x)
+            SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
+                     &A11, &A12, &A22);
+    }
+    yBase+=ysize;
+    if(yBase<c_winSize_y)
+    {
+        xBase=xid;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],
+                 &A11, &A12, &A22);
+
+
+        xBase+=xsize;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],
+                 &A11, &A12, &A22);
+
+        xBase+=xsize;
+        if(xBase<c_winSize_x)
+            SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
+                     &A11, &A12, &A22);
+    }
+
+    reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
+
+    A11 = smem1[0];
+    A12 = smem2[0];
+    A22 = smem3[0];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    float D = A11 * A22 - A12 * A12;
+
+    if (D < 1.192092896e-07f)
+    {
+        if (tid == 0 && level == 0)
+            status[gid] = 0;
+
+        return;
+    }
+
+    A11 /= D;
+    A12 /= D;
+    A22 /= D;
+
+    prevPt = nextPts[gid] * 2.0f - c_halfWin;
+
+    for (k = 0; k < c_iters; ++k)
+    {
+        if (prevPt.x < -c_halfWin.x || prevPt.x >= cols || prevPt.y < -c_halfWin.y || prevPt.y >= rows)
+        {
+            if (tid == 0 && level == 0)
+                status[gid] = 0;
+            return;
+        }
+
+        float b1 = 0;
+        float b2 = 0;
+
+        yBase=yid;
+        {
+            xBase=xid;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],
+                     &b1, &b2);
+
+
+            xBase+=xsize;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],
+                     &b1, &b2);
+
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
+                         &b1, &b2);
+        }
+        yBase+=ysize;
+        {
+            xBase=xid;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],
+                     &b1, &b2);
+
+
+            xBase+=xsize;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],
+                     &b1, &b2);
+
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
+                         &b1, &b2);
+        }
+        yBase+=ysize;
+        if(yBase<c_winSize_y)
+        {
+            xBase=xid;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],
+                     &b1, &b2);
+
+
+            xBase+=xsize;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],
+                     &b1, &b2);
+
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
+                         &b1, &b2);
+        }
+
+        reduce2(b1, b2, smem1, smem2, tid);
+
+        b1 = smem1[0];
+        b2 = smem2[0];
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        float2 delta;
+        delta.x = A12 * b2 - A22 * b1;
+        delta.y = A12 * b1 - A11 * b2;
+
+        prevPt += delta;
+
+        if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD)
+            break;
+    }
+
+    D = 0.0f;
+    if (calcErr)
+    {
+        yBase=yid;
+        {
+            xBase=xid;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][0], &D);
+
+
+            xBase+=xsize;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][1], &D);
+
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[0][2], &D);
+        }
+        yBase+=ysize;
+        {
+            xBase=xid;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][0], &D);
+
+
+            xBase+=xsize;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][1], &D);
+
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[1][2], &D);
+        }
+        yBase+=ysize;
+        if(yBase<c_winSize_y)
+        {
+            xBase=xid;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][0], &D);
+
+
+            xBase+=xsize;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][1], &D);
+
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[2][2], &D);
+        }
+
+        reduce1(D, smem1, tid);
+    }
+
+    if (tid == 0)
+    {
+        prevPt += c_halfWin;
+
+        nextPts[gid] = prevPt;
+
+        if (calcErr)
+            err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y);
+    }
+}
\ No newline at end of file
diff --git a/modules/video/test/ocl/test_optflowpyrlk.cpp b/modules/video/test/ocl/test_optflowpyrlk.cpp
new file mode 100644
index 000000000..f3ef6363d
--- /dev/null
+++ b/modules/video/test/ocl/test_optflowpyrlk.cpp
@@ -0,0 +1,140 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+
+#ifdef HAVE_OPENCL
+
+
+namespace cvtest {
+namespace ocl {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// PyrLKOpticalFlow
+
+PARAM_TEST_CASE(PyrLKOpticalFlow, int, int)
+{
+    Size winSize;
+    int maxLevel;
+    TermCriteria criteria;
+    int flags;
+    double minEigThreshold;
+
+    virtual void SetUp()
+    {
+        winSize = Size(GET_PARAM(0), GET_PARAM(0));
+        maxLevel = GET_PARAM(1);
+        criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 0.01);
+        flags = 0;
+        minEigThreshold = 1e-4f;
+    }
+};
+
+OCL_TEST_P(PyrLKOpticalFlow, Mat)
+{
+    cv::Mat frame0 = readImage("optflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+    UMat umatFrame0; frame0.copyTo(umatFrame0);
+
+    cv::Mat frame1 = readImage("optflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+    UMat umatFrame1; frame1.copyTo(umatFrame1);
+
+    std::vector<cv::Point2f> pts;
+    cv::goodFeaturesToTrack(frame0, pts, 1000, 0.01, 0.0);
+
+    std::vector<cv::Point2f> nextPtsCPU;
+    std::vector<unsigned char> statusCPU;
+    std::vector<float> errCPU;
+    OCL_OFF(cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPtsCPU, statusCPU, errCPU, winSize, maxLevel, criteria, flags, minEigThreshold));
+
+    UMat umatNextPts, umatStatus, umatErr;
+    OCL_ON(cv::calcOpticalFlowPyrLK(umatFrame0, umatFrame1, pts, umatNextPts, umatStatus, umatErr, winSize, maxLevel, criteria, flags, minEigThreshold));
+    std::vector<cv::Point2f> nextPts(umatNextPts.cols); umatNextPts.copyTo(nextPts);
+    std::vector<unsigned char> status; umatStatus.copyTo(status);
+    std::vector<float> err; umatErr.copyTo(err);
+
+    ASSERT_EQ(nextPtsCPU.size(), nextPts.size());
+    ASSERT_EQ(statusCPU.size(), status.size());
+
+    size_t mistmatch = 0;
+    for (size_t i = 0; i < nextPts.size(); ++i)
+    {
+        if (status[i] != statusCPU[i])
+        {
+            ++mistmatch;
+            continue;
+        }
+
+        if (status[i])
+        {
+            cv::Point2i a = nextPts[i];
+            cv::Point2i b = nextPtsCPU[i];
+
+            bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1;
+            float errdiff = 0.0f;
+
+            if (!eq || errdiff > 1e-1)
+                ++mistmatch;
+        }
+    }
+
+    double bad_ratio = static_cast<double>(mistmatch) / (nextPts.size());
+
+    ASSERT_LE(bad_ratio, 0.02f);
+}
+
+OCL_INSTANTIATE_TEST_CASE_P(Video, PyrLKOpticalFlow,
+                            Combine(
+                                Values(21, 25),
+                                Values(3, 5)
+                                )
+                           );
+
+} } // namespace cvtest::ocl
+
+
+#endif // HAVE_OPENCL
\ No newline at end of file