renamed gpuoptflow -> cudaoptflow

Vladislav Vinogradov
2013-07-23 17:04:38 +04:00
parent a0ae602bb7
commit 5660d6a680
47 changed files with 45 additions and 45 deletions


@@ -0,0 +1,204 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::cuda::calcOpticalFlowBM(const GpuMat&, const GpuMat&, Size, Size, Size, bool, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
#else // HAVE_CUDA
namespace optflowbm
{
void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream);
}
void cv::cuda::calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr, Size blockSize, Size shiftSize, Size maxRange, bool usePrevious, GpuMat& velx, GpuMat& vely, GpuMat& buf, Stream& st)
{
CV_Assert( prev.type() == CV_8UC1 );
CV_Assert( curr.size() == prev.size() && curr.type() == prev.type() );
const Size velSize((prev.cols - blockSize.width + shiftSize.width) / shiftSize.width,
(prev.rows - blockSize.height + shiftSize.height) / shiftSize.height);
velx.create(velSize, CV_32FC1);
vely.create(velSize, CV_32FC1);
// scanning scheme coordinates
std::vector<short2> ss((2 * maxRange.width + 1) * (2 * maxRange.height + 1));
int ssCount = 0;
// Calculate scanning scheme
const int minCount = std::min(maxRange.width, maxRange.height);
// use spiral search pattern
//
// 9 10 11 12
// 8 1 2 13
// 7 * 3 14
// 6 5 4 15
//... 20 19 18 17
//
for (int i = 0; i < minCount; ++i)
{
// four cycles along sides
int x = -i - 1, y = x;
// upper side
for (int j = -i; j <= i + 1; ++j, ++ssCount)
{
ss[ssCount].x = (short) ++x;
ss[ssCount].y = (short) y;
}
// right side
for (int j = -i; j <= i + 1; ++j, ++ssCount)
{
ss[ssCount].x = (short) x;
ss[ssCount].y = (short) ++y;
}
// bottom side
for (int j = -i; j <= i + 1; ++j, ++ssCount)
{
ss[ssCount].x = (short) --x;
ss[ssCount].y = (short) y;
}
// left side
for (int j = -i; j <= i + 1; ++j, ++ssCount)
{
ss[ssCount].x = (short) x;
ss[ssCount].y = (short) --y;
}
}
// the remaining (non-square) part of the range
if (maxRange.width < maxRange.height)
{
const int xleft = -minCount;
// cycle by neighbor rings
for (int i = minCount; i < maxRange.height; ++i)
{
// two cycles by x
int y = -(i + 1);
int x = xleft;
// upper side
for (int j = -maxRange.width; j <= maxRange.width; ++j, ++ssCount, ++x)
{
ss[ssCount].x = (short) x;
ss[ssCount].y = (short) y;
}
x = xleft;
y = -y;
// bottom side
for (int j = -maxRange.width; j <= maxRange.width; ++j, ++ssCount, ++x)
{
ss[ssCount].x = (short) x;
ss[ssCount].y = (short) y;
}
}
}
else if (maxRange.width > maxRange.height)
{
const int yupper = -minCount;
// cycle by neighbor rings
for (int i = minCount; i < maxRange.width; ++i)
{
// two cycles by y
int x = -(i + 1);
int y = yupper;
// left side
for (int j = -maxRange.height; j <= maxRange.height; ++j, ++ssCount, ++y)
{
ss[ssCount].x = (short) x;
ss[ssCount].y = (short) y;
}
y = yupper;
x = -x;
// right side
for (int j = -maxRange.height; j <= maxRange.height; ++j, ++ssCount, ++y)
{
ss[ssCount].x = (short) x;
ss[ssCount].y = (short) y;
}
}
}
const cudaStream_t stream = StreamAccessor::getStream(st);
ensureSizeIsEnough(1, ssCount, CV_16SC2, buf);
if (stream == 0)
cudaSafeCall( cudaMemcpy(buf.data, &ss[0], ssCount * sizeof(short2), cudaMemcpyHostToDevice) );
else
cudaSafeCall( cudaMemcpyAsync(buf.data, &ss[0], ssCount * sizeof(short2), cudaMemcpyHostToDevice, stream) );
const int maxX = prev.cols - blockSize.width;
const int maxY = prev.rows - blockSize.height;
const int SMALL_DIFF = 2;
const int BIG_DIFF = 128;
const int blSize = blockSize.area();
const int acceptLevel = blSize * SMALL_DIFF;
const int escapeLevel = blSize * BIG_DIFF;
optflowbm::calc(prev, curr, velx, vely,
make_int2(blockSize.width, blockSize.height), make_int2(shiftSize.width, shiftSize.height), usePrevious,
maxX, maxY, acceptLevel, escapeLevel, buf.ptr<short2>(), ssCount, stream);
}
#endif // HAVE_CUDA
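For context, a minimal host-side usage sketch of the wrapper above (not part of the commit; frame names and parameter values are illustrative, the inputs are assumed to be 8-bit grayscale GpuMats already on the device, and the renamed module header opencv2/cudaoptflow.hpp is assumed to be installed):
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaoptflow.hpp>
// Illustrative call into cv::cuda::calcOpticalFlowBM as declared above.
void runBlockMatchingFlow(const cv::cuda::GpuMat& prevFrame, const cv::cuda::GpuMat& currFrame)
{
    cv::cuda::GpuMat velx, vely, buf;
    cv::cuda::calcOpticalFlowBM(prevFrame, currFrame,
                                cv::Size(16, 16),  // blockSize
                                cv::Size(1, 1),    // shiftSize
                                cv::Size(4, 4),    // maxRange
                                false,             // usePrevious
                                velx, vely, buf,
                                cv::cuda::Stream::Null());
}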


@@ -0,0 +1,90 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::cuda::FastOpticalFlowBM::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, int, int, Stream&) { throw_no_cuda(); }
#else // HAVE_CUDA
namespace optflowbm_fast
{
void get_buffer_size(int src_cols, int src_rows, int search_window, int block_window, int& buffer_cols, int& buffer_rows);
template <typename T>
void calc(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream);
}
void cv::cuda::FastOpticalFlowBM::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, int search_window, int block_window, Stream& stream)
{
CV_Assert( I0.type() == CV_8UC1 );
CV_Assert( I1.size() == I0.size() && I1.type() == I0.type() );
int border_size = search_window / 2 + block_window / 2;
Size esize = I0.size() + Size(border_size, border_size) * 2;
ensureSizeIsEnough(esize, I0.type(), extended_I0);
ensureSizeIsEnough(esize, I0.type(), extended_I1);
cuda::copyMakeBorder(I0, extended_I0, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), stream);
cuda::copyMakeBorder(I1, extended_I1, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), stream);
GpuMat I0_hdr = extended_I0(Rect(Point2i(border_size, border_size), I0.size()));
GpuMat I1_hdr = extended_I1(Rect(Point2i(border_size, border_size), I0.size()));
int bcols, brows;
optflowbm_fast::get_buffer_size(I0.cols, I0.rows, search_window, block_window, bcols, brows);
ensureSizeIsEnough(brows, bcols, CV_32SC1, buffer);
flowx.create(I0.size(), CV_32FC1);
flowy.create(I0.size(), CV_32FC1);
optflowbm_fast::calc<uchar>(I0_hdr, I1_hdr, flowx, flowy, buffer, search_window, block_window, StreamAccessor::getStream(stream));
}
#endif // HAVE_CUDA
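Similarly, a hypothetical usage sketch for FastOpticalFlowBM (the window sizes here are illustrative values, not defaults taken from the header):
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaoptflow.hpp>
// Illustrative call into cv::cuda::FastOpticalFlowBM::operator() as declared above.
void runFastBlockMatchingFlow(const cv::cuda::GpuMat& I0, const cv::cuda::GpuMat& I1)
{
    cv::cuda::FastOpticalFlowBM fbm;
    cv::cuda::GpuMat flowx, flowy;
    fbm(I0, I1, flowx, flowy, 21 /* search_window */, 7 /* block_window */, cv::cuda::Stream::Null());
}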


@@ -0,0 +1,130 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || !defined (HAVE_OPENCV_CUDALEGACY) || defined (CUDA_DISABLER)
void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
#else
namespace
{
size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc, const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
{
NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) );
return gpuCounter.maxSize();
}
}
namespace
{
static void outputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
}
void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s)
{
ncvSetDebugOutputHandler(outputHandler);
CV_Assert(frame0.type() == CV_32FC1);
CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
u.create(frame0.size(), CV_32FC1);
v.create(frame0.size(), CV_32FC1);
cudaDeviceProp devProp;
cudaSafeCall( cudaGetDeviceProperties(&devProp, getDevice()) );
NCVBroxOpticalFlowDescriptor desc;
desc.alpha = alpha;
desc.gamma = gamma;
desc.scale_factor = scale_factor;
desc.number_of_inner_iterations = inner_iterations;
desc.number_of_outer_iterations = outer_iterations;
desc.number_of_solver_iterations = solver_iterations;
NCVMemSegment frame0MemSeg;
frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
frame0MemSeg.size = frame0.step * frame0.rows;
NCVMemSegment frame1MemSeg;
frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
frame1MemSeg.size = frame1.step * frame1.rows;
NCVMemSegment uMemSeg;
uMemSeg.begin.memtype = NCVMemoryTypeDevice;
uMemSeg.begin.ptr = u.ptr();
uMemSeg.size = u.step * u.rows;
NCVMemSegment vMemSeg;
vMemSeg.begin.memtype = NCVMemoryTypeDevice;
vMemSeg.begin.ptr = v.ptr();
vMemSeg.size = v.step * v.rows;
NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));
cudaStream_t stream = StreamAccessor::getStream(s);
size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, devProp);
ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, buf);
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) );
}
#endif /* HAVE_CUDA */
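A hypothetical usage sketch for the Brox flow wrapper above, assuming a six-argument constructor in the order (alpha, gamma, scale_factor, inner_iterations, outer_iterations, solver_iterations) matching the descriptor fields filled in above; the parameter values are common example values, not prescribed defaults, and frames must already be CV_32FC1 GpuMats:
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaoptflow.hpp>
// Illustrative call into cv::cuda::BroxOpticalFlow::operator() as defined above.
void runBroxFlow(const cv::cuda::GpuMat& frame0, const cv::cuda::GpuMat& frame1)
{
    // alpha, gamma, scale_factor, inner_iterations, outer_iterations, solver_iterations
    cv::cuda::BroxOpticalFlow brox(0.197f, 50.0f, 0.8f, 10, 77, 10);
    cv::cuda::GpuMat u, v;
    brox(frame0, frame1, u, v, cv::cuda::Stream::Null());
}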


@@ -0,0 +1,169 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace optflowbm
{
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_prev(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_curr(false, cudaFilterModePoint, cudaAddressModeClamp);
__device__ int cmpBlocks(int X1, int Y1, int X2, int Y2, int2 blockSize)
{
int s = 0;
for (int y = 0; y < blockSize.y; ++y)
{
for (int x = 0; x < blockSize.x; ++x)
s += ::abs(tex2D(tex_prev, X1 + x, Y1 + y) - tex2D(tex_curr, X2 + x, Y2 + y));
}
return s;
}
__global__ void calcOptFlowBM(PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
const int maxX, const int maxY, const int acceptLevel, const int escapeLevel,
const short2* ss, const int ssCount)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i >= velx.rows || j >= velx.cols)
return;
const int X1 = j * shiftSize.x;
const int Y1 = i * shiftSize.y;
const int offX = usePrevious ? __float2int_rn(velx(i, j)) : 0;
const int offY = usePrevious ? __float2int_rn(vely(i, j)) : 0;
int X2 = X1 + offX;
int Y2 = Y1 + offY;
int dist = numeric_limits<int>::max();
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
dist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
int countMin = 1;
int sumx = offX;
int sumy = offY;
if (dist > acceptLevel)
{
// do brute-force search
for (int k = 0; k < ssCount; ++k)
{
const short2 ssVal = ss[k];
const int dx = offX + ssVal.x;
const int dy = offY + ssVal.y;
X2 = X1 + dx;
Y2 = Y1 + dy;
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
{
const int tmpDist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
if (tmpDist < acceptLevel)
{
sumx = dx;
sumy = dy;
countMin = 1;
break;
}
if (tmpDist < dist)
{
dist = tmpDist;
sumx = dx;
sumy = dy;
countMin = 1;
}
else if (tmpDist == dist)
{
sumx += dx;
sumy += dy;
countMin++;
}
}
}
if (dist > escapeLevel)
{
sumx = offX;
sumy = offY;
countMin = 1;
}
}
velx(i, j) = static_cast<float>(sumx) / countMin;
vely(i, j) = static_cast<float>(sumy) / countMin;
}
void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream)
{
bindTexture(&tex_prev, prev);
bindTexture(&tex_curr, curr);
const dim3 block(32, 8);
const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y));
calcOptFlowBM<<<grid, block, 0, stream>>>(velx, vely, blockSize, shiftSize, usePrevious,
maxX, maxY, acceptLevel, escapeLevel, ss, ssCount);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif // !defined CUDA_DISABLER
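For reference, the per-candidate cost evaluated by cmpBlocks above is a plain sum of absolute differences over the block. A CPU sketch of the same cost (illustrative only, using cv::Mat instead of clamped textures, so the caller must keep the coordinates in range):
#include <opencv2/core.hpp>
#include <cstdlib>
// CPU reference for the SAD block cost used by the kernel above (illustrative only).
static int blockSAD(const cv::Mat& prev, const cv::Mat& curr,
                    int X1, int Y1, int X2, int Y2, cv::Size blockSize)
{
    int s = 0;
    for (int y = 0; y < blockSize.height; ++y)
        for (int x = 0; x < blockSize.width; ++x)
            s += std::abs(int(prev.at<uchar>(Y1 + y, X1 + x)) -
                          int(curr.at<uchar>(Y2 + y, X2 + x)));
    return s;
}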


@@ -0,0 +1,295 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace optflowbm_fast
{
enum
{
CTA_SIZE = 128,
TILE_COLS = 128,
TILE_ROWS = 32,
STRIDE = CTA_SIZE
};
template <typename T> __device__ __forceinline__ int calcDist(T a, T b)
{
return ::abs(a - b);
}
template <class T> struct FastOptFlowBM
{
int search_radius;
int block_radius;
int search_window;
int block_window;
PtrStepSz<T> I0;
PtrStep<T> I1;
mutable PtrStepi buffer;
FastOptFlowBM(int search_window_, int block_window_,
PtrStepSz<T> I0_, PtrStepSz<T> I1_,
PtrStepi buffer_) :
search_radius(search_window_ / 2), block_radius(block_window_ / 2),
search_window(search_window_), block_window(block_window_),
I0(I0_), I1(I1_),
buffer(buffer_)
{
}
__device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
dist_sums[index] = 0;
for (int tx = 0; tx < block_window; ++tx)
col_sums(tx, index) = 0;
int y = index / search_window;
int x = index - y * search_window;
int ay = i;
int ax = j;
int by = i + y - search_radius;
int bx = j + x - search_radius;
for (int tx = -block_radius; tx <= block_radius; ++tx)
{
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
{
int dist = calcDist(I0(ay + ty, ax + tx), I1(by + ty, bx + tx));
dist_sums[index] += dist;
col_sum += dist;
}
col_sums(tx + block_radius, index) = col_sum;
}
up_col_sums(j, index) = col_sums(block_window - 1, index);
}
}
__device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
int ay = i;
int ax = j + block_radius;
int by = i + y - search_radius;
int bx = j + x - search_radius + block_radius;
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
col_sum += calcDist(I0(ay + ty, ax), I1(by + ty, bx));
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
__device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
int ay = i;
int ax = j + block_radius;
T a_up = I0(ay - block_radius - 1, ax);
T a_down = I0(ay + block_radius, ax);
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
int by = i + y - search_radius;
int bx = j + x - search_radius + block_radius;
T b_up = I1(by - block_radius - 1, bx);
T b_down = I1(by + block_radius, bx);
int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
__device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, float& velx, float& vely) const
{
int bestDist = numeric_limits<int>::max();
int bestInd = -1;
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int curDist = dist_sums[index];
if (curDist < bestDist)
{
bestDist = curDist;
bestInd = index;
}
}
__shared__ int cta_dist_buffer[CTA_SIZE];
__shared__ int cta_ind_buffer[CTA_SIZE];
reduceKeyVal<CTA_SIZE>(cta_dist_buffer, bestDist, cta_ind_buffer, bestInd, threadIdx.x, less<int>());
if (threadIdx.x == 0)
{
int y = bestInd / search_window;
int x = bestInd - y * search_window;
velx = x - search_radius;
vely = y - search_radius;
}
}
__device__ __forceinline__ void operator()(PtrStepf velx, PtrStepf vely) const
{
int tbx = blockIdx.x * TILE_COLS;
int tby = blockIdx.y * TILE_ROWS;
int tex = ::min(tbx + TILE_COLS, I0.cols);
int tey = ::min(tby + TILE_ROWS, I0.rows);
PtrStepi col_sums;
col_sums.data = buffer.ptr(I0.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window;
col_sums.step = buffer.step;
PtrStepi up_col_sums;
up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window;
up_col_sums.step = buffer.step;
extern __shared__ int dist_sums[]; //search_window * search_window
int first = 0;
for (int i = tby; i < tey; ++i)
{
for (int j = tbx; j < tex; ++j)
{
__syncthreads();
if (j == tbx)
{
initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums);
first = 0;
}
else
{
if (i == tby)
shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
else
shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums);
first = (first + 1) % block_window;
}
__syncthreads();
convolve_window(i, j, dist_sums, velx(i, j), vely(i, j));
}
}
}
};
template<typename T> __global__ void optflowbm_fast_kernel(const FastOptFlowBM<T> fbm, PtrStepf velx, PtrStepf vely)
{
fbm(velx, vely);
}
void get_buffer_size(int src_cols, int src_rows, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
{
dim3 grid(divUp(src_cols, TILE_COLS), divUp(src_rows, TILE_ROWS));
buffer_cols = search_window * search_window * grid.y;
buffer_rows = src_cols + block_window * grid.x;
}
template <typename T>
void calc(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream)
{
FastOptFlowBM<T> fbm(search_window, block_window, I0, I1, buffer);
dim3 block(CTA_SIZE, 1);
dim3 grid(divUp(I0.cols, TILE_COLS), divUp(I0.rows, TILE_ROWS));
size_t smem = search_window * search_window * sizeof(int);
optflowbm_fast_kernel<<<grid, block, smem, stream>>>(fbm, velx, vely);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void calc<uchar>(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream);
}
#endif // !defined CUDA_DISABLER
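The kernel above avoids recomputing the full block SAD for every pixel: for each candidate displacement it keeps one SAD per block column (col_sums), so that shifting the block one pixel to the right only replaces the oldest column (shiftRight_FirstRow), and on later rows that new column is itself updated from the row above by adding the entering pixel and subtracting the leaving one (shiftRight_UpSums via up_col_sums). A small CPU sketch of the column-replacement step (names are illustrative, not taken from the file):
#include <vector>
// Illustrative sketch of the rolling block SAD kept per candidate displacement:
// 'total' is the current block SAD, 'col_sums' holds one SAD per block column,
// and 'first' indexes the column that falls out of the block when it shifts right.
struct RollingBlockSAD
{
    std::vector<int> col_sums;
    int total = 0;
    explicit RollingBlockSAD(int block_window) : col_sums(block_window, 0) {}
    // Shift the block window one pixel right: drop the oldest column sum and
    // account for the freshly computed rightmost column.
    void shiftRight(int first, int new_col_sum)
    {
        total += new_col_sum - col_sums[first];
        col_sums[first] = new_col_sum;
    }
};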


@@ -0,0 +1,656 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#define tx threadIdx.x
#define ty threadIdx.y
#define bx blockIdx.x
#define by blockIdx.y
#define bdx blockDim.x
#define bdy blockDim.y
#define BORDER_SIZE 5
#define MAX_KSIZE_HALF 100
namespace cv { namespace cuda { namespace device { namespace optflow_farneback
{
__constant__ float c_g[8];
__constant__ float c_xg[8];
__constant__ float c_xxg[8];
__constant__ float c_ig11, c_ig03, c_ig33, c_ig55;
template <int polyN>
__global__ void polynomialExpansion(
const int height, const int width, const PtrStepf src, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * (bdx - 2*polyN) + tx - polyN;
if (y < height)
{
extern __shared__ float smem[];
volatile float *row = smem + tx;
int xWarped = ::min(::max(x, 0), width - 1);
row[0] = src(y, xWarped) * c_g[0];
row[bdx] = 0.f;
row[2*bdx] = 0.f;
for (int k = 1; k <= polyN; ++k)
{
float t0 = src(::max(y - k, 0), xWarped);
float t1 = src(::min(y + k, height - 1), xWarped);
row[0] += c_g[k] * (t0 + t1);
row[bdx] += c_xg[k] * (t1 - t0);
row[2*bdx] += c_xxg[k] * (t0 + t1);
}
__syncthreads();
if (tx >= polyN && tx + polyN < bdx && x < width)
{
float b1 = c_g[0] * row[0];
float b3 = c_g[0] * row[bdx];
float b5 = c_g[0] * row[2*bdx];
float b2 = 0, b4 = 0, b6 = 0;
for (int k = 1; k <= polyN; ++k)
{
b1 += (row[k] + row[-k]) * c_g[k];
b4 += (row[k] + row[-k]) * c_xxg[k];
b2 += (row[k] - row[-k]) * c_xg[k];
b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];
b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];
b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
}
dst(y, xWarped) = b3*c_ig11;
dst(height + y, xWarped) = b2*c_ig11;
dst(2*height + y, xWarped) = b1*c_ig03 + b5*c_ig33;
dst(3*height + y, xWarped) = b1*c_ig03 + b4*c_ig33;
dst(4*height + y, xWarped) = b6*c_ig55;
}
}
}
void setPolynomialExpansionConsts(
int polyN, const float *g, const float *xg, const float *xxg,
float ig11, float ig03, float ig33, float ig55)
{
cudaSafeCall(cudaMemcpyToSymbol(c_g, g, (polyN + 1) * sizeof(*g)));
cudaSafeCall(cudaMemcpyToSymbol(c_xg, xg, (polyN + 1) * sizeof(*xg)));
cudaSafeCall(cudaMemcpyToSymbol(c_xxg, xxg, (polyN + 1) * sizeof(*xxg)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig11, &ig11, sizeof(ig11)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig03, &ig03, sizeof(ig03)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig33, &ig33, sizeof(ig33)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig55, &ig55, sizeof(ig55)));
}
void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream)
{
dim3 block(256);
dim3 grid(divUp(src.cols, block.x - 2*polyN), src.rows);
int smem = 3 * block.x * sizeof(float);
if (polyN == 5)
polynomialExpansion<5><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
else if (polyN == 7)
polynomialExpansion<7><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
__constant__ float c_border[BORDER_SIZE + 1];
__global__ void updateMatrices(
const int height, const int width, const PtrStepf flowx, const PtrStepf flowy,
const PtrStepf R0, const PtrStepf R1, PtrStepf M)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
if (y < height && x < width)
{
float dx = flowx(y, x);
float dy = flowy(y, x);
float fx = x + dx;
float fy = y + dy;
int x1 = floorf(fx);
int y1 = floorf(fy);
fx -= x1; fy -= y1;
float r2, r3, r4, r5, r6;
if (x1 >= 0 && y1 >= 0 && x1 < width - 1 && y1 < height - 1)
{
float a00 = (1.f - fx) * (1.f - fy);
float a01 = fx * (1.f - fy);
float a10 = (1.f - fx) * fy;
float a11 = fx * fy;
r2 = a00 * R1(y1, x1) +
a01 * R1(y1, x1 + 1) +
a10 * R1(y1 + 1, x1) +
a11 * R1(y1 + 1, x1 + 1);
r3 = a00 * R1(height + y1, x1) +
a01 * R1(height + y1, x1 + 1) +
a10 * R1(height + y1 + 1, x1) +
a11 * R1(height + y1 + 1, x1 + 1);
r4 = a00 * R1(2*height + y1, x1) +
a01 * R1(2*height + y1, x1 + 1) +
a10 * R1(2*height + y1 + 1, x1) +
a11 * R1(2*height + y1 + 1, x1 + 1);
r5 = a00 * R1(3*height + y1, x1) +
a01 * R1(3*height + y1, x1 + 1) +
a10 * R1(3*height + y1 + 1, x1) +
a11 * R1(3*height + y1 + 1, x1 + 1);
r6 = a00 * R1(4*height + y1, x1) +
a01 * R1(4*height + y1, x1 + 1) +
a10 * R1(4*height + y1 + 1, x1) +
a11 * R1(4*height + y1 + 1, x1 + 1);
r4 = (R0(2*height + y, x) + r4) * 0.5f;
r5 = (R0(3*height + y, x) + r5) * 0.5f;
r6 = (R0(4*height + y, x) + r6) * 0.25f;
}
else
{
r2 = r3 = 0.f;
r4 = R0(2*height + y, x);
r5 = R0(3*height + y, x);
r6 = R0(4*height + y, x) * 0.5f;
}
r2 = (R0(y, x) - r2) * 0.5f;
r3 = (R0(height + y, x) - r3) * 0.5f;
r2 += r4*dy + r6*dx;
r3 += r6*dy + r5*dx;
float scale =
c_border[::min(x, BORDER_SIZE)] *
c_border[::min(y, BORDER_SIZE)] *
c_border[::min(width - x - 1, BORDER_SIZE)] *
c_border[::min(height - y - 1, BORDER_SIZE)];
r2 *= scale; r3 *= scale; r4 *= scale;
r5 *= scale; r6 *= scale;
M(y, x) = r4*r4 + r6*r6;
M(height + y, x) = (r4 + r5)*r6;
M(2*height + y, x) = r5*r5 + r6*r6;
M(3*height + y, x) = r4*r2 + r6*r3;
M(4*height + y, x) = r6*r2 + r5*r3;
}
}
void setUpdateMatricesConsts()
{
static const float border[BORDER_SIZE + 1] = {0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f};
cudaSafeCall(cudaMemcpyToSymbol(c_border, border, (BORDER_SIZE + 1) * sizeof(*border)));
}
void updateMatricesGpu(
const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
PtrStepSzf M, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
updateMatrices<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, flowx, flowy, R0, R1, M);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
__global__ void updateFlow(
const int height, const int width, const PtrStepf M, PtrStepf flowx, PtrStepf flowy)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
if (y < height && x < width)
{
float g11 = M(y, x);
float g12 = M(height + y, x);
float g22 = M(2*height + y, x);
float h1 = M(3*height + y, x);
float h2 = M(4*height + y, x);
float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);
flowx(y, x) = (g11*h2 - g12*h1) * detInv;
flowy(y, x) = (g22*h1 - g12*h2) * detInv;
}
}
void updateFlowGpu(const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
updateFlow<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, M, flowx, flowy);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
/*__global__ void boxFilter(
const int height, const int width, const PtrStepf src,
const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = ::min(::max(xExt, 0), width - 1);
row[i] = src(y, xExt);
for (int j = 1; j <= ksizeHalf; ++j)
row[i] += src(::max(y - j, 0), xExt) + src(::min(y + j, height - 1), xExt);
}
if (x < width)
{
__syncthreads();
// Horizontal pass
row += tx + ksizeHalf;
float res = row[0];
for (int i = 1; i <= ksizeHalf; ++i)
res += row[-i] + row[i];
dst(y, x) = res * boxAreaInv;
}
}
}
void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
dim3 block(256);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
boxFilter<<<grid, block, smem, stream>>>(src.rows, src.cols, src, ksizeHalf, boxAreaInv, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}*/
__global__ void boxFilter5(
const int height, const int width, const PtrStepf src,
const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
const int smw = bdx + 2*ksizeHalf; // shared memory "width"
volatile float *row = smem + 5 * ty * smw;
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = ::min(::max(xExt, 0), width - 1);
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] = src(k*height + y, xExt);
for (int j = 1; j <= ksizeHalf; ++j)
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] +=
src(k*height + ::max(y - j, 0), xExt) +
src(k*height + ::min(y + j, height - 1), xExt);
}
if (x < width)
{
__syncthreads();
// Horizontal pass
row += tx + ksizeHalf;
float res[5];
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] = row[k*smw];
for (int i = 1; i <= ksizeHalf; ++i)
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] += row[k*smw - i] + row[k*smw + i];
#pragma unroll
for (int k = 0; k < 5; ++k)
dst(k*height + y, x) = res[k] * boxAreaInv;
}
}
}
void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
dim3 block(256);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
dim3 block(128);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
__constant__ float c_gKer[MAX_KSIZE_HALF + 1];
template <typename Border>
__global__ void gaussianBlur(
const int height, const int width, const PtrStepf src, const int ksizeHalf,
const Border b, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = b.idx_col(xExt);
row[i] = src(y, xExt) * c_gKer[0];
for (int j = 1; j <= ksizeHalf; ++j)
row[i] +=
(src(b.idx_row_low(y - j), xExt) +
src(b.idx_row_high(y + j), xExt)) * c_gKer[j];
}
if (x < width)
{
__syncthreads();
// Horizontal pass
row += tx + ksizeHalf;
float res = row[0] * c_gKer[0];
for (int i = 1; i <= ksizeHalf; ++i)
res += (row[-i] + row[i]) * c_gKer[i];
dst(y, x) = res;
}
}
}
void setGaussianBlurKernel(const float *gKer, int ksizeHalf)
{
cudaSafeCall(cudaMemcpyToSymbol(c_gKer, gKer, (ksizeHalf + 1) * sizeof(*gKer)));
}
template <typename Border>
void gaussianBlurCaller(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows;
int width = src.cols;
dim3 block(256);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
Border b(height, width);
gaussianBlur<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void gaussianBlurGpu(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
0 /*gaussianBlurCaller<BrdConstant<float> >*/,
gaussianBlurCaller<BrdReplicate<float> >,
0 /*gaussianBlurCaller<BrdReflect<float> >*/,
0 /*gaussianBlurCaller<BrdWrap<float> >*/,
gaussianBlurCaller<BrdReflect101<float> >
};
callers[borderMode](src, ksizeHalf, dst, stream);
}
template <typename Border>
__global__ void gaussianBlur5(
const int height, const int width, const PtrStepf src, const int ksizeHalf,
const Border b, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
const int smw = bdx + 2*ksizeHalf; // shared memory "width"
volatile float *row = smem + 5 * ty * smw;
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = b.idx_col(xExt);
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] = src(k*height + y, xExt) * c_gKer[0];
for (int j = 1; j <= ksizeHalf; ++j)
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] +=
(src(k*height + b.idx_row_low(y - j), xExt) +
src(k*height + b.idx_row_high(y + j), xExt)) * c_gKer[j];
}
if (x < width)
{
__syncthreads();
// Horizontal pass
row += tx + ksizeHalf;
float res[5];
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] = row[k*smw] * c_gKer[0];
for (int i = 1; i <= ksizeHalf; ++i)
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];
#pragma unroll
for (int k = 0; k < 5; ++k)
dst(k*height + y, x) = res[k];
}
}
}
template <typename Border, int blockDimX>
void gaussianBlur5Caller(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
dim3 block(blockDimX);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
Border b(height, width);
gaussianBlur5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void gaussianBlur5Gpu(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
0 /*gaussianBlur5Caller<BrdConstant<float>,256>*/,
gaussianBlur5Caller<BrdReplicate<float>,256>,
0 /*gaussianBlur5Caller<BrdReflect<float>,256>*/,
0 /*gaussianBlur5Caller<BrdWrap<float>,256>*/,
gaussianBlur5Caller<BrdReflect101<float>,256>
};
callers[borderMode](src, ksizeHalf, dst, stream);
}
void gaussianBlur5Gpu_CC11(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
0 /*gaussianBlur5Caller<BrdConstant<float>,128>*/,
gaussianBlur5Caller<BrdReplicate<float>,128>,
0 /*gaussianBlur5Caller<BrdReflect<float>,128>*/,
0 /*gaussianBlur5Caller<BrdWrap<float>,128>*/,
gaussianBlur5Caller<BrdReflect101<float>,128>
};
callers[borderMode](src, ksizeHalf, dst, stream);
}
}}}} // namespace cv { namespace cuda { namespace device { namespace optflow_farneback
#endif /* CUDA_DISABLER */
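For clarity, the per-pixel work done by updateFlow above amounts to solving a regularized 2x2 linear system built from the accumulated matrix entries; a plain C++ sketch of the same arithmetic (illustrative only, mirroring the kernel expressions):
#include <utility>
// Mirrors the per-pixel solve in updateFlow: given the smoothed matrix entries
// (g11, g12, g22) and right-hand side (h1, h2), return (flowx, flowy).
static std::pair<float, float> solveFlowPixel(float g11, float g12, float g22, float h1, float h2)
{
    const float detInv = 1.f / (g11 * g22 - g12 * g12 + 1e-3f); // small bias keeps the matrix invertible
    const float fx = (g11 * h2 - g12 * h1) * detInv;
    const float fy = (g22 * h1 - g12 * h2) * detInv;
    return std::make_pair(fx, fy);
}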


@@ -0,0 +1,220 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace cuda { namespace device
{
namespace optical_flow
{
#define NEEDLE_MAP_SCALE 16
#define NUM_VERTS_PER_ARROW 6
__global__ void NeedleMapAverageKernel(const PtrStepSzf u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)
{
__shared__ float smem[2 * NEEDLE_MAP_SCALE];
volatile float* u_col_sum = smem;
volatile float* v_col_sum = u_col_sum + NEEDLE_MAP_SCALE;
const int x = blockIdx.x * NEEDLE_MAP_SCALE + threadIdx.x;
const int y = blockIdx.y * NEEDLE_MAP_SCALE;
u_col_sum[threadIdx.x] = 0;
v_col_sum[threadIdx.x] = 0;
#pragma unroll
for(int i = 0; i < NEEDLE_MAP_SCALE; ++i)
{
u_col_sum[threadIdx.x] += u(::min(y + i, u.rows - 1), x);
v_col_sum[threadIdx.x] += v(::min(y + i, u.rows - 1), x);
}
if (threadIdx.x < 8)
{
// now add the column sums
const uint X = threadIdx.x;
if ((X | 0xfe) == 0xfe) // bit 0 is 0
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 1];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 1];
}
if ((X | 0xfc) == 0xfc) // bits 0 & 1 == 0
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2];
}
if ((X | 0xf8) == 0xf8) // bits 0, 1 & 2 == 0
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 4];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 4];
}
if (X == 0)
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 8];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 8];
}
}
if (threadIdx.x == 0)
{
const float coeff = 1.0f / (NEEDLE_MAP_SCALE * NEEDLE_MAP_SCALE);
u_col_sum[0] *= coeff;
v_col_sum[0] *= coeff;
u_avg(blockIdx.y, blockIdx.x) = u_col_sum[0];
v_avg(blockIdx.y, blockIdx.x) = v_col_sum[0];
}
}
void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg)
{
const dim3 block(NEEDLE_MAP_SCALE);
const dim3 grid(u_avg.cols, u_avg.rows);
NeedleMapAverageKernel<<<grid, block>>>(u, v, u_avg, v_avg);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void NeedleMapVertexKernel(const PtrStepSzf u_avg, const PtrStepf v_avg, float* vertex_data, float* color_data, float max_flow, float xscale, float yscale)
{
// test - just draw a triangle at each pixel
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const float arrow_x = x * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
const float arrow_y = y * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
float3 v[NUM_VERTS_PER_ARROW];
if (x < u_avg.cols && y < u_avg.rows)
{
const float u_avg_val = u_avg(y, x);
const float v_avg_val = v_avg(y, x);
const float theta = ::atan2f(v_avg_val, u_avg_val);
float r = ::sqrtf(v_avg_val * v_avg_val + u_avg_val * u_avg_val);
r = fmin(14.0f * (r / max_flow), 14.0f);
v[0].z = 1.0f;
v[1].z = 0.7f;
v[2].z = 0.7f;
v[3].z = 0.7f;
v[4].z = 0.7f;
v[5].z = 1.0f;
v[0].x = arrow_x;
v[0].y = arrow_y;
v[5].x = arrow_x;
v[5].y = arrow_y;
v[2].x = arrow_x + r * ::cosf(theta);
v[2].y = arrow_y + r * ::sinf(theta);
v[3].x = v[2].x;
v[3].y = v[2].y;
r = ::fmin(r, 2.5f);
v[1].x = arrow_x + r * ::cosf(theta - CV_PI_F / 2.0f);
v[1].y = arrow_y + r * ::sinf(theta - CV_PI_F / 2.0f);
v[4].x = arrow_x + r * ::cosf(theta + CV_PI_F / 2.0f);
v[4].y = arrow_y + r * ::sinf(theta + CV_PI_F / 2.0f);
int indx = (y * u_avg.cols + x) * NUM_VERTS_PER_ARROW * 3;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[0].x * xscale;
vertex_data[indx++] = v[0].y * yscale;
vertex_data[indx++] = v[0].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[1].x * xscale;
vertex_data[indx++] = v[1].y * yscale;
vertex_data[indx++] = v[1].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[2].x * xscale;
vertex_data[indx++] = v[2].y * yscale;
vertex_data[indx++] = v[2].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[3].x * xscale;
vertex_data[indx++] = v[3].y * yscale;
vertex_data[indx++] = v[3].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[4].x * xscale;
vertex_data[indx++] = v[4].y * yscale;
vertex_data[indx++] = v[4].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[5].x * xscale;
vertex_data[indx++] = v[5].y * yscale;
vertex_data[indx++] = v[5].z;
}
}
void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale)
{
const dim3 block(16);
const dim3 grid(divUp(u_avg.cols, block.x), divUp(u_avg.rows, block.y));
NeedleMapVertexKernel<<<grid, block>>>(u_avg, v_avg, vertex_buffer, color_data, max_flow, xscale, yscale);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */
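The vertex kernel above reduces each 16x16 cell to a single arrow whose direction and length come from the averaged flow. A small CPU sketch of that geometry (illustrative only; function and variable names are not from the file):
#include <cmath>
#include <algorithm>
// Illustrative CPU version of the arrow direction/length computed per cell above.
static void arrowFromFlow(float u_avg, float v_avg, float max_flow,
                          float& theta, float& length)
{
    theta = std::atan2(v_avg, u_avg);
    const float r = std::sqrt(u_avg * u_avg + v_avg * v_avg);
    length = std::min(14.0f * (r / max_flow), 14.0f); // clamp to the 14-pixel maximum used by the kernel
}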


@@ -0,0 +1,560 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/reduce.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace pyrlk
{
__constant__ int c_winSize_x;
__constant__ int c_winSize_y;
__constant__ int c_halfWin_x;
__constant__ int c_halfWin_y;
__constant__ int c_iters;
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
template <int cn> struct Tex_I;
template <> struct Tex_I<1>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_If, x, y);
}
};
template <> struct Tex_I<4>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_If4, x, y);
}
};
template <int cn> struct Tex_J;
template <> struct Tex_J<1>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_Jf, x, y);
}
};
template <> struct Tex_J<4>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_Jf4, x, y);
}
};
__device__ __forceinline__ void accum(float& dst, float val)
{
dst += val;
}
__device__ __forceinline__ void accum(float& dst, const float4& val)
{
dst += val.x + val.y + val.z;
}
__device__ __forceinline__ float abs_(float a)
{
return ::fabsf(a);
}
__device__ __forceinline__ float4 abs_(const float4& a)
{
return abs(a);
}
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
__global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{
#if __CUDA_ARCH__ <= 110
const int BLOCK_SIZE = 128;
#else
const int BLOCK_SIZE = 256;
#endif
__shared__ float smem1[BLOCK_SIZE];
__shared__ float smem2[BLOCK_SIZE];
__shared__ float smem3[BLOCK_SIZE];
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
float2 prevPt = prevPts[blockIdx.x];
prevPt.x *= (1.0f / (1 << level));
prevPt.y *= (1.0f / (1 << level));
if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
prevPt.x -= c_halfWin_x;
prevPt.y -= c_halfWin_y;
// extract the patch from the first image and compute the covariance matrix of its derivatives
float A11 = 0;
float A12 = 0;
float A22 = 0;
typedef typename TypeVec<float, cn>::vec_type work_type;
work_type I_patch [PATCH_Y][PATCH_X];
work_type dIdx_patch[PATCH_Y][PATCH_X];
work_type dIdy_patch[PATCH_Y][PATCH_X];
for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i)
{
for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j)
{
float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f;
I_patch[i][j] = Tex_I<cn>::read(x, y);
// Scharr derivative (3, 10, 3 kernel)
work_type dIdx = 3.0f * Tex_I<cn>::read(x+1, y-1) + 10.0f * Tex_I<cn>::read(x+1, y) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
(3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x-1, y) + 3.0f * Tex_I<cn>::read(x-1, y+1));
work_type dIdy = 3.0f * Tex_I<cn>::read(x-1, y+1) + 10.0f * Tex_I<cn>::read(x, y+1) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
(3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x, y-1) + 3.0f * Tex_I<cn>::read(x+1, y-1));
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
accum(A11, dIdx * dIdx);
accum(A12, dIdx * dIdy);
accum(A22, dIdy * dIdy);
}
}
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = A11;
smem2[0] = A12;
smem3[0] = A22;
}
#endif
__syncthreads();
A11 = smem1[0];
A12 = smem2[0];
A22 = smem3[0];
float D = A11 * A22 - A12 * A12;
if (D < numeric_limits<float>::epsilon())
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
D = 1.f / D;
A11 *= D;
A12 *= D;
A22 *= D;
float2 nextPt = nextPts[blockIdx.x];
nextPt.x *= 2.f;
nextPt.y *= 2.f;
nextPt.x -= c_halfWin_x;
nextPt.y -= c_halfWin_y;
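// iterative refinement: each step solves the 2x2 normal equations
// [A11 A12; A12 A22] * delta = -[b1 b2]^T (A11, A12, A22 are already multiplied by 1/D)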
for (int k = 0; k < c_iters; ++k)
{
if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
float b1 = 0;
float b2 = 0;
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = (J_val - I_val) * 32.0f;
accum(b1, diff * dIdx_patch[i][j]);
accum(b2, diff * dIdy_patch[i][j]);
}
}
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = b1;
smem2[0] = b2;
}
#endif
__syncthreads();
b1 = smem1[0];
b2 = smem2[0];
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
float errval = 0;
if (calcErr)
{
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = J_val - I_val;
accum(errval, abs_(diff));
}
}
reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
}
if (tid == 0)
{
nextPt.x += c_halfWin_x;
nextPt.y += c_halfWin_y;
nextPts[blockIdx.x] = nextPt;
if (calcErr)
err[blockIdx.x] = static_cast<float>(errval) / (cn * c_winSize_x * c_winSize_y);
}
}
template <int cn, int PATCH_X, int PATCH_Y>
void sparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
if (level == 0 && err)
sparseKernel<cn, PATCH_X, PATCH_Y, true><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
else
sparseKernel<cn, PATCH_X, PATCH_Y, false><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <bool calcErr>
__global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
{
extern __shared__ int smem[];
const int patchWidth = blockDim.x + 2 * c_halfWin_x;
const int patchHeight = blockDim.y + 2 * c_halfWin_y;
int* I_patch = smem;
int* dIdx_patch = I_patch + patchWidth * patchHeight;
int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;
const int xBase = blockIdx.x * blockDim.x;
const int yBase = blockIdx.y * blockDim.y;
for (int i = threadIdx.y; i < patchHeight; i += blockDim.y)
{
for (int j = threadIdx.x; j < patchWidth; j += blockDim.x)
{
float x = xBase - c_halfWin_x + j + 0.5f;
float y = yBase - c_halfWin_y + i + 0.5f;
I_patch[i * patchWidth + j] = tex2D(tex_Ib, x, y);
// Scharr derivative (3, 10, 3 kernel)
dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x+1, y-1) + 10 * tex2D(tex_Ib, x+1, y) + 3 * tex2D(tex_Ib, x+1, y+1) -
(3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x-1, y) + 3 * tex2D(tex_Ib, x-1, y+1));
dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x-1, y+1) + 10 * tex2D(tex_Ib, x, y+1) + 3 * tex2D(tex_Ib, x+1, y+1) -
(3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x, y-1) + 3 * tex2D(tex_Ib, x+1, y-1));
}
}
__syncthreads();
const int x = xBase + threadIdx.x;
const int y = yBase + threadIdx.y;
if (x >= cols || y >= rows)
return;
int A11i = 0;
int A12i = 0;
int A22i = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
A11i += dIdx * dIdx;
A12i += dIdx * dIdy;
A22i += dIdy * dIdy;
}
}
float A11 = A11i;
float A12 = A12i;
float A22 = A22i;
float D = A11 * A22 - A12 * A12;
if (D < numeric_limits<float>::epsilon())
{
if (calcErr)
err(y, x) = numeric_limits<float>::max();
return;
}
D = 1.f / D;
A11 *= D;
A12 *= D;
A22 *= D;
float2 nextPt;
nextPt.x = x + prevU(y/2, x/2) * 2.0f;
nextPt.y = y + prevV(y/2, x/2) * 2.0f;
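// nextPt was initialized from the coarser-level flow (prevU/prevV scaled by 2);
// refine it with the same 2x2 normal-equation solve as in the sparse kernel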
for (int k = 0; k < c_iters; ++k)
{
if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
{
if (calcErr)
err(y, x) = numeric_limits<float>::max();
return;
}
int b1 = 0;
int b2 = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
int diff = (J - I) * 32;
int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
b1 += diff * dIdx;
b2 += diff * dIdy;
}
}
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
u(y, x) = nextPt.x - x;
v(y, x) = nextPt.y - y;
if (calcErr)
{
int errval = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
errval += ::abs(J - I);
}
}
err(y, x) = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);
}
}
void loadConstants(int2 winSize, int iters)
{
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
}
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
static const func_t funcs[5][5] =
{
{sparse_caller<1, 1, 1>, sparse_caller<1, 2, 1>, sparse_caller<1, 3, 1>, sparse_caller<1, 4, 1>, sparse_caller<1, 5, 1>},
{sparse_caller<1, 1, 2>, sparse_caller<1, 2, 2>, sparse_caller<1, 3, 2>, sparse_caller<1, 4, 2>, sparse_caller<1, 5, 2>},
{sparse_caller<1, 1, 3>, sparse_caller<1, 2, 3>, sparse_caller<1, 3, 3>, sparse_caller<1, 4, 3>, sparse_caller<1, 5, 3>},
{sparse_caller<1, 1, 4>, sparse_caller<1, 2, 4>, sparse_caller<1, 3, 4>, sparse_caller<1, 4, 4>, sparse_caller<1, 5, 4>},
{sparse_caller<1, 1, 5>, sparse_caller<1, 2, 5>, sparse_caller<1, 3, 5>, sparse_caller<1, 4, 5>, sparse_caller<1, 5, 5>}
};
bindTexture(&tex_If, I);
bindTexture(&tex_Jf, J);
funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
static const func_t funcs[5][5] =
{
{sparse_caller<4, 1, 1>, sparse_caller<4, 2, 1>, sparse_caller<4, 3, 1>, sparse_caller<4, 4, 1>, sparse_caller<4, 5, 1>},
{sparse_caller<4, 1, 2>, sparse_caller<4, 2, 2>, sparse_caller<4, 3, 2>, sparse_caller<4, 4, 2>, sparse_caller<4, 5, 2>},
{sparse_caller<4, 1, 3>, sparse_caller<4, 2, 3>, sparse_caller<4, 3, 3>, sparse_caller<4, 4, 3>, sparse_caller<4, 5, 3>},
{sparse_caller<4, 1, 4>, sparse_caller<4, 2, 4>, sparse_caller<4, 3, 4>, sparse_caller<4, 4, 4>, sparse_caller<4, 5, 4>},
{sparse_caller<4, 1, 5>, sparse_caller<4, 2, 5>, sparse_caller<4, 3, 5>, sparse_caller<4, 4, 5>, sparse_caller<4, 5, 5>}
};
bindTexture(&tex_If4, I);
bindTexture(&tex_Jf4, J);
funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
bindTexture(&tex_Ib, I);
bindTexture(&tex_Jf, J);
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
const int patchWidth = block.x + 2 * halfWin.x;
const int patchHeight = block.y + 2 * halfWin.y;
size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);
if (err.data)
{
denseKernel<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);
cudaSafeCall( cudaGetLastError() );
}
else
{
denseKernel<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,332 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/limits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
////////////////////////////////////////////////////////////
// centeredGradient
namespace tvl1flow
{
__global__ void centeredGradientKernel(const PtrStepSzf src, PtrStepf dx, PtrStepf dy)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= src.cols || y >= src.rows)
return;
dx(y, x) = 0.5f * (src(y, ::min(x + 1, src.cols - 1)) - src(y, ::max(x - 1, 0)));
dy(y, x) = 0.5f * (src(::min(y + 1, src.rows - 1), x) - src(::max(y - 1, 0), x));
}
void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
centeredGradientKernel<<<grid, block>>>(src, dx, dy);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
////////////////////////////////////////////////////////////
// warpBackward
namespace tvl1flow
{
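// Keys bicubic interpolation weight with a = -0.5 (Catmull-Rom); zero for |x| >= 2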
static __device__ __forceinline__ float bicubicCoeff(float x_)
{
float x = fabsf(x_);
if (x <= 1.0f)
{
return x * x * (1.5f * x - 2.5f) + 1.0f;
}
else if (x < 2.0f)
{
return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
}
else
{
return 0.0f;
}
}
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void warpBackwardKernel(const PtrStepSzf I0, const PtrStepf u1, const PtrStepf u2, PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= I0.cols || y >= I0.rows)
return;
const float u1Val = u1(y, x);
const float u2Val = u2(y, x);
const float wx = x + u1Val;
const float wy = y + u2Val;
const int xmin = ::ceilf(wx - 2.0f);
const int xmax = ::floorf(wx + 2.0f);
const int ymin = ::ceilf(wy - 2.0f);
const int ymax = ::floorf(wy + 2.0f);
float sum = 0.0f;
float sumx = 0.0f;
float sumy = 0.0f;
float wsum = 0.0f;
for (int cy = ymin; cy <= ymax; ++cy)
{
for (int cx = xmin; cx <= xmax; ++cx)
{
const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
sum += w * tex2D(tex_I1 , cx, cy);
sumx += w * tex2D(tex_I1x, cx, cy);
sumy += w * tex2D(tex_I1y, cx, cy);
wsum += w;
}
}
const float coeff = 1.0f / wsum;
const float I1wVal = sum * coeff;
const float I1wxVal = sumx * coeff;
const float I1wyVal = sumy * coeff;
I1w(y, x) = I1wVal;
I1wx(y, x) = I1wxVal;
I1wy(y, x) = I1wyVal;
const float Ix2 = I1wxVal * I1wxVal;
const float Iy2 = I1wyVal * I1wyVal;
// store the |Grad(I1)|^2
grad(y, x) = Ix2 + Iy2;
// compute the constant part of the rho function
const float I0Val = I0(y, x);
rho(y, x) = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
}
void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho)
{
const dim3 block(32, 8);
const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
bindTexture(&tex_I1 , I1);
bindTexture(&tex_I1x, I1x);
bindTexture(&tex_I1y, I1y);
warpBackwardKernel<<<grid, block>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
////////////////////////////////////////////////////////////
// estimateU
namespace tvl1flow
{
__device__ float divergence(const PtrStepf& v1, const PtrStepf& v2, int y, int x)
{
if (x > 0 && y > 0)
{
const float v1x = v1(y, x) - v1(y, x - 1);
const float v2y = v2(y, x) - v2(y - 1, x);
return v1x + v2y;
}
else
{
if (y > 0)
return v1(y, 0) + v2(y, 0) - v2(y - 1, 0);
else
{
if (x > 0)
return v1(0, x) - v1(0, x - 1) + v2(0, x);
else
return v1(0, 0) + v2(0, 0);
}
}
}
__global__ void estimateUKernel(const PtrStepSzf I1wx, const PtrStepf I1wy,
const PtrStepf grad, const PtrStepf rho_c,
const PtrStepf p11, const PtrStepf p12, const PtrStepf p21, const PtrStepf p22,
PtrStepf u1, PtrStepf u2, PtrStepf error,
const float l_t, const float theta)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= I1wx.cols || y >= I1wx.rows)
return;
const float I1wxVal = I1wx(y, x);
const float I1wyVal = I1wy(y, x);
const float gradVal = grad(y, x);
const float u1OldVal = u1(y, x);
const float u2OldVal = u2(y, x);
const float rho = rho_c(y, x) + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);
// estimate the values of the variable (v1, v2) (thresholding operator TH)
float d1 = 0.0f;
float d2 = 0.0f;
if (rho < -l_t * gradVal)
{
d1 = l_t * I1wxVal;
d2 = l_t * I1wyVal;
}
else if (rho > l_t * gradVal)
{
d1 = -l_t * I1wxVal;
d2 = -l_t * I1wyVal;
}
else if (gradVal > numeric_limits<float>::epsilon())
{
const float fi = -rho / gradVal;
d1 = fi * I1wxVal;
d2 = fi * I1wyVal;
}
const float v1 = u1OldVal + d1;
const float v2 = u2OldVal + d2;
// compute the divergence of the dual variable (p1, p2)
const float div_p1 = divergence(p11, p12, y, x);
const float div_p2 = divergence(p21, p22, y, x);
// estimate the values of the optical flow (u1, u2)
const float u1NewVal = v1 + theta * div_p1;
const float u2NewVal = v2 + theta * div_p2;
u1(y, x) = u1NewVal;
u2(y, x) = u2NewVal;
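// squared magnitude of the flow update; used by the caller as a convergence measure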
const float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
const float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
error(y, x) = n1 + n2;
}
void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
PtrStepSzf grad, PtrStepSzf rho_c,
PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22,
PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error,
float l_t, float theta)
{
const dim3 block(32, 8);
const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));
estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, error, l_t, theta);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
////////////////////////////////////////////////////////////
// estimateDualVariables
namespace tvl1flow
{
__global__ void estimateDualVariablesKernel(const PtrStepSzf u1, const PtrStepf u2, PtrStepf p11, PtrStepf p12, PtrStepf p21, PtrStepf p22, const float taut)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= u1.cols || y >= u1.rows)
return;
const float u1x = u1(y, ::min(x + 1, u1.cols - 1)) - u1(y, x);
const float u1y = u1(::min(y + 1, u1.rows - 1), x) - u1(y, x);
const float u2x = u2(y, ::min(x + 1, u1.cols - 1)) - u2(y, x);
const float u2y = u2(::min(y + 1, u1.rows - 1), x) - u2(y, x);
const float g1 = ::hypotf(u1x, u1y);
const float g2 = ::hypotf(u2x, u2y);
const float ng1 = 1.0f + taut * g1;
const float ng2 = 1.0f + taut * g2;
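// dual ascent step followed by the pointwise reprojection p <- (p + taut * grad(u)) / (1 + taut * |grad(u)|)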
p11(y, x) = (p11(y, x) + taut * u1x) / ng1;
p12(y, x) = (p12(y, x) + taut * u1y) / ng1;
p21(y, x) = (p21(y, x) + taut * u2x) / ng2;
p22(y, x) = (p22(y, x) + taut * u2y) / ng2;
}
void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, float taut)
{
const dim3 block(32, 8);
const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));
estimateDualVariablesKernel<<<grid, block>>>(u1, u2, p11, p12, p21, p22, taut);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif // !defined CUDA_DISABLER

View File

@@ -0,0 +1,378 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#define MIN_SIZE 32
#define S(x) StreamAccessor::getStream(x)
// GPU resize() is fast, but it differs from the CPU analog. Disabling this flag
// leads to inefficient code; it is intended for debugging purposes only.
#define ENABLE_GPU_RESIZE 1
using namespace cv;
using namespace cv::cuda;
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::cuda::FarnebackOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
#else
namespace cv { namespace cuda { namespace device { namespace optflow_farneback
{
void setPolynomialExpansionConsts(
int polyN, const float *g, const float *xg, const float *xxg,
float ig11, float ig03, float ig33, float ig55);
void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream);
void setUpdateMatricesConsts();
void updateMatricesGpu(
const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
PtrStepSzf M, cudaStream_t stream);
void updateFlowGpu(
const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream);
/*void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);*/
void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);
void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);
void setGaussianBlurKernel(const float *gKer, int ksizeHalf);
void gaussianBlurGpu(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);
void gaussianBlur5Gpu(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);
void gaussianBlur5Gpu_CC11(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);
}}}} // namespace cv { namespace cuda { namespace device { namespace optflow_farneback
void cv::cuda::FarnebackOpticalFlow::prepareGaussian(
int n, double sigma, float *g, float *xg, float *xxg,
double &ig11, double &ig03, double &ig33, double &ig55)
{
double s = 0.;
for (int x = -n; x <= n; x++)
{
g[x] = (float)std::exp(-x*x/(2*sigma*sigma));
s += g[x];
}
s = 1./s;
for (int x = -n; x <= n; x++)
{
g[x] = (float)(g[x]*s);
xg[x] = (float)(x*g[x]);
xxg[x] = (float)(x*x*g[x]);
}
Mat_<double> G(6, 6);
G.setTo(0);
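// G is the 6x6 Gram matrix of the weighted polynomial basis {1, x, y, x^2, y^2, xy};
// under the symmetric Gaussian weight the odd moments vanish, leaving only the entries filled in below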
for (int y = -n; y <= n; y++)
{
for (int x = -n; x <= n; x++)
{
G(0,0) += g[y]*g[x];
G(1,1) += g[y]*g[x]*x*x;
G(3,3) += g[y]*g[x]*x*x*x*x;
G(5,5) += g[y]*g[x]*x*x*y*y;
}
}
//G[0][0] = 1.;
G(2,2) = G(0,3) = G(0,4) = G(3,0) = G(4,0) = G(1,1);
G(4,4) = G(3,3);
G(3,4) = G(4,3) = G(5,5);
// invG:
// [ x        e  e    ]
// [    y             ]
// [       y          ]
// [ e        z       ]
// [ e           z    ]
// [                u ]
Mat_<double> invG = G.inv(DECOMP_CHOLESKY);
ig11 = invG(1,1);
ig03 = invG(0,3);
ig33 = invG(3,3);
ig55 = invG(5,5);
}
void cv::cuda::FarnebackOpticalFlow::setPolynomialExpansionConsts(int n, double sigma)
{
std::vector<float> buf(n*6 + 3);
float* g = &buf[0] + n;
float* xg = g + n*2 + 1;
float* xxg = xg + n*2 + 1;
if (sigma < FLT_EPSILON)
sigma = n*0.3;
double ig11, ig03, ig33, ig55;
prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, static_cast<float>(ig11), static_cast<float>(ig03), static_cast<float>(ig33), static_cast<float>(ig55));
}
void cv::cuda::FarnebackOpticalFlow::updateFlow_boxFilter(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
{
if (deviceSupports(FEATURE_SET_COMPUTE_12))
device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
else
device::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, S(streams[0]));
swap(M, bufM);
for (int i = 1; i < 5; ++i)
streams[i].waitForCompletion();
device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
if (updateMatrices)
device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
}
void cv::cuda::FarnebackOpticalFlow::updateFlow_gaussianBlur(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
{
if (deviceSupports(FEATURE_SET_COMPUTE_12))
device::optflow_farneback::gaussianBlur5Gpu(
M, blockSize/2, bufM, BORDER_REPLICATE, S(streams[0]));
else
device::optflow_farneback::gaussianBlur5Gpu_CC11(
M, blockSize/2, bufM, BORDER_REPLICATE, S(streams[0]));
swap(M, bufM);
device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
if (updateMatrices)
device::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
}
void cv::cuda::FarnebackOpticalFlow::operator ()(
const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s)
{
CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
CV_Assert(frame0.size() == frame1.size());
CV_Assert(polyN == 5 || polyN == 7);
CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6);
Stream streams[5];
if (S(s))
streams[0] = s;
Size size = frame0.size();
GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY;
flowx.create(size, CV_32F);
flowy.create(size, CV_32F);
GpuMat flowx0 = flowx;
GpuMat flowy0 = flowy;
// Crop unnecessary levels
double scale = 1;
int numLevelsCropped = 0;
for (; numLevelsCropped < numLevels; numLevelsCropped++)
{
scale *= pyrScale;
if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
break;
}
frame0.convertTo(frames_[0], CV_32F, streams[0]);
frame1.convertTo(frames_[1], CV_32F, streams[1]);
if (fastPyramids)
{
// Build Gaussian pyramids using pyrDown()
pyramid0_.resize(numLevelsCropped + 1);
pyramid1_.resize(numLevelsCropped + 1);
pyramid0_[0] = frames_[0];
pyramid1_[0] = frames_[1];
for (int i = 1; i <= numLevelsCropped; ++i)
{
cuda::pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]);
cuda::pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]);
}
}
setPolynomialExpansionConsts(polyN, polySigma);
device::optflow_farneback::setUpdateMatricesConsts();
for (int k = numLevelsCropped; k >= 0; k--)
{
streams[0].waitForCompletion();
scale = 1;
for (int i = 0; i < k; i++)
scale *= pyrScale;
double sigma = (1./scale - 1) * 0.5;
int smoothSize = cvRound(sigma*5) | 1;
smoothSize = std::max(smoothSize, 3);
int width = cvRound(size.width*scale);
int height = cvRound(size.height*scale);
if (fastPyramids)
{
width = pyramid0_[k].cols;
height = pyramid0_[k].rows;
}
if (k > 0)
{
curFlowX.create(height, width, CV_32F);
curFlowY.create(height, width, CV_32F);
}
else
{
curFlowX = flowx0;
curFlowY = flowy0;
}
if (!prevFlowX.data)
{
if (flags & OPTFLOW_USE_INITIAL_FLOW)
{
cuda::resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
cuda::resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
curFlowX.convertTo(curFlowX, curFlowX.depth(), scale, streams[0]);
curFlowY.convertTo(curFlowY, curFlowY.depth(), scale, streams[1]);
}
else
{
curFlowX.setTo(0, streams[0]);
curFlowY.setTo(0, streams[1]);
}
}
else
{
cuda::resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
cuda::resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
curFlowX.convertTo(curFlowX, curFlowX.depth(), 1./pyrScale, streams[0]);
curFlowY.convertTo(curFlowY, curFlowY.depth(), 1./pyrScale, streams[1]);
}
GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
GpuMat R[2] =
{
allocMatFromBuf(5*height, width, CV_32F, R_[0]),
allocMatFromBuf(5*height, width, CV_32F, R_[1])
};
if (fastPyramids)
{
device::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN, R[0], S(streams[0]));
device::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN, R[1], S(streams[1]));
}
else
{
GpuMat blurredFrame[2] =
{
allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
};
GpuMat pyrLevel[2] =
{
allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
};
Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);
for (int i = 0; i < 2; i++)
{
device::optflow_farneback::gaussianBlurGpu(
frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101, S(streams[i]));
cuda::resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0.0, 0.0, INTER_LINEAR, streams[i]);
device::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN, R[i], S(streams[i]));
}
}
streams[1].waitForCompletion();
device::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, S(streams[0]));
if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
{
Mat g = getGaussianKernel(winSize, winSize/2*0.3f, CV_32F);
device::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize/2), winSize/2);
}
for (int i = 0; i < numIters; i++)
{
if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
else
updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
}
prevFlowX = curFlowX;
prevFlowY = curFlowY;
}
flowx = curFlowX;
flowy = curFlowY;
if (!S(s))
streams[0].waitForCompletion();
}
#endif
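A minimal host-side sketch of how the functor above might be driven, assuming the module is built with CUDA, both frames are single-channel GpuMats already on the device, and the default Stream argument declared in cudaoptflow.hpp; the function and variable names here are illustrative only, not part of the module:
#include "opencv2/cudaoptflow.hpp"
#include "opencv2/core/cuda.hpp"
static void exampleFarneback(const cv::cuda::GpuMat& frame0, const cv::cuda::GpuMat& frame1)
{
    cv::cuda::FarnebackOpticalFlow flow;   // default parameters
    cv::cuda::GpuMat flowx, flowy;         // operator() creates these as CV_32F
    flow(frame0, frame1, flowx, flowy);    // default stream: the call synchronizes before returning
}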

View File

@@ -0,0 +1,113 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || !defined (HAVE_OPENCV_CUDALEGACY) || defined (CUDA_DISABLER)
void cv::cuda::interpolateFrames(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, float, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
#else
void cv::cuda::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, const GpuMat& fu, const GpuMat& fv, const GpuMat& bu, const GpuMat& bv,
float pos, GpuMat& newFrame, GpuMat& buf, Stream& s)
{
CV_Assert(frame0.type() == CV_32FC1);
CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
CV_Assert(fu.size() == frame0.size() && fu.type() == frame0.type());
CV_Assert(fv.size() == frame0.size() && fv.type() == frame0.type());
CV_Assert(bu.size() == frame0.size() && bu.type() == frame0.type());
CV_Assert(bv.size() == frame0.size() && bv.type() == frame0.type());
newFrame.create(frame0.size(), frame0.type());
buf.create(6 * frame0.rows, frame0.cols, CV_32FC1);
buf.setTo(Scalar::all(0));
// occlusion masks
GpuMat occ0 = buf.rowRange(0 * frame0.rows, 1 * frame0.rows);
GpuMat occ1 = buf.rowRange(1 * frame0.rows, 2 * frame0.rows);
// interpolated forward flow
GpuMat fui = buf.rowRange(2 * frame0.rows, 3 * frame0.rows);
GpuMat fvi = buf.rowRange(3 * frame0.rows, 4 * frame0.rows);
// interpolated backward flow
GpuMat bui = buf.rowRange(4 * frame0.rows, 5 * frame0.rows);
GpuMat bvi = buf.rowRange(5 * frame0.rows, 6 * frame0.rows);
size_t step = frame0.step;
CV_Assert(frame1.step == step && fu.step == step && fv.step == step && bu.step == step && bv.step == step && newFrame.step == step && buf.step == step);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStStreamHandler h(stream);
NppStInterpolationState state;
state.size = NcvSize32u(frame0.cols, frame0.rows);
state.nStep = static_cast<Ncv32u>(step);
state.pSrcFrame0 = const_cast<Ncv32f*>(frame0.ptr<Ncv32f>());
state.pSrcFrame1 = const_cast<Ncv32f*>(frame1.ptr<Ncv32f>());
state.pFU = const_cast<Ncv32f*>(fu.ptr<Ncv32f>());
state.pFV = const_cast<Ncv32f*>(fv.ptr<Ncv32f>());
state.pBU = const_cast<Ncv32f*>(bu.ptr<Ncv32f>());
state.pBV = const_cast<Ncv32f*>(bv.ptr<Ncv32f>());
state.pos = pos;
state.pNewFrame = newFrame.ptr<Ncv32f>();
state.ppBuffers[0] = occ0.ptr<Ncv32f>();
state.ppBuffers[1] = occ1.ptr<Ncv32f>();
state.ppBuffers[2] = fui.ptr<Ncv32f>();
state.ppBuffers[3] = fvi.ptr<Ncv32f>();
state.ppBuffers[4] = bui.ptr<Ncv32f>();
state.ppBuffers[5] = bvi.ptr<Ncv32f>();
ncvSafeCall( nppiStInterpolateFrames(&state) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
#endif /* HAVE_CUDA && HAVE_OPENCV_CUDALEGACY */

View File

@@ -0,0 +1,100 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::cuda::createOpticalFlowNeedleMap(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
#else
namespace cv { namespace cuda { namespace device
{
namespace optical_flow
{
void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg);
void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale);
}
}}}
void cv::cuda::createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMat& vertex, GpuMat& colors)
{
using namespace cv::cuda::device::optical_flow;
CV_Assert(u.type() == CV_32FC1);
CV_Assert(v.type() == u.type() && v.size() == u.size());
const int NEEDLE_MAP_SCALE = 16;
const int x_needles = u.cols / NEEDLE_MAP_SCALE;
const int y_needles = u.rows / NEEDLE_MAP_SCALE;
GpuMat u_avg(y_needles, x_needles, CV_32FC1);
GpuMat v_avg(y_needles, x_needles, CV_32FC1);
NeedleMapAverage_gpu(u, v, u_avg, v_avg);
const int NUM_VERTS_PER_ARROW = 6;
const int num_arrows = x_needles * y_needles * NUM_VERTS_PER_ARROW;
vertex.create(1, num_arrows, CV_32FC3);
colors.create(1, num_arrows, CV_32FC3);
colors.setTo(Scalar::all(1.0));
double uMax, vMax;
cuda::minMax(u_avg, 0, &uMax);
cuda::minMax(v_avg, 0, &vMax);
float max_flow = static_cast<float>(std::sqrt(uMax * uMax + vMax * vMax));
CreateOpticalFlowNeedleMap_gpu(u_avg, v_avg, vertex.ptr<float>(), colors.ptr<float>(), max_flow, 1.0f / u.cols, 1.0f / u.rows);
cuda::cvtColor(colors, colors, COLOR_HSV2RGB);
}
#endif /* HAVE_CUDA */
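A minimal sketch of the needle-map helper above, assuming u and v are CV_32FC1 flow components produced by one of the flow implementations in this module and that the function is exposed through cudaoptflow.hpp; names are illustrative:
#include "opencv2/cudaoptflow.hpp"
static void exampleNeedleMap(const cv::cuda::GpuMat& u, const cv::cuda::GpuMat& v)
{
    cv::cuda::GpuMat vertex, colors;       // filled as 1 x N CV_32FC3 buffers
    cv::cuda::createOpticalFlowNeedleMap(u, v, vertex, colors);
    // vertex and colors can then be drawn, e.g. through the OpenGL interop in the core module
}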

View File

@@ -0,0 +1,43 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"

View File

@@ -0,0 +1,62 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_PRECOMP_H__
#define __OPENCV_PRECOMP_H__
#include <limits>
#include "opencv2/cudaoptflow.hpp"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudawarping.hpp"
#include "opencv2/cudaimgproc.hpp"
#include "opencv2/video.hpp"
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDALEGACY
# include "opencv2/cudalegacy/private.hpp"
#endif
#endif /* __OPENCV_PRECOMP_H__ */

View File

@@ -0,0 +1,250 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::cuda::PyrLKOpticalFlow::PyrLKOpticalFlow() { throw_no_cuda(); }
void cv::cuda::PyrLKOpticalFlow::sparse(const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_no_cuda(); }
void cv::cuda::PyrLKOpticalFlow::dense(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_no_cuda(); }
void cv::cuda::PyrLKOpticalFlow::releaseMemory() {}
#else /* !defined (HAVE_CUDA) */
namespace pyrlk
{
void loadConstants(int2 winSize, int iters);
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
}
cv::cuda::PyrLKOpticalFlow::PyrLKOpticalFlow()
{
winSize = Size(21, 21);
maxLevel = 3;
iters = 30;
useInitialFlow = false;
}
namespace
{
void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch)
{
if (winSize.width > 32 && winSize.width > 2 * winSize.height)
{
block.x = deviceSupports(FEATURE_SET_COMPUTE_12) ? 32 : 16;
block.y = 8;
}
else
{
block.x = 16;
block.y = deviceSupports(FEATURE_SET_COMPUTE_12) ? 16 : 8;
}
patch.x = (winSize.width + block.x - 1) / block.x;
patch.y = (winSize.height + block.y - 1) / block.y;
block.z = patch.z = 1;
}
}
void cv::cuda::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err)
{
if (prevPts.empty())
{
nextPts.release();
status.release();
if (err) err->release();
return;
}
dim3 block, patch;
calcPatchSize(winSize, block, patch);
CV_Assert(prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4);
CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
CV_Assert(maxLevel >= 0);
CV_Assert(winSize.width > 2 && winSize.height > 2);
CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);
if (useInitialFlow)
CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2);
else
ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
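// scale the starting coordinates down to the coarsest pyramid level; the extra factor of 1/2
// is compensated by the nextPt *= 2 performed at the top of sparseKernel on the coarsest level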
GpuMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
GpuMat temp2 = nextPts.reshape(1);
cuda::multiply(temp1, Scalar::all(1.0 / (1 << maxLevel) / 2.0), temp2);
ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
status.setTo(Scalar::all(1));
if (err)
ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
// build the image pyramids.
prevPyr_.resize(maxLevel + 1);
nextPyr_.resize(maxLevel + 1);
int cn = prevImg.channels();
if (cn == 1 || cn == 4)
{
prevImg.convertTo(prevPyr_[0], CV_32F);
nextImg.convertTo(nextPyr_[0], CV_32F);
}
else
{
cuda::cvtColor(prevImg, buf_, COLOR_BGR2BGRA);
buf_.convertTo(prevPyr_[0], CV_32F);
cuda::cvtColor(nextImg, buf_, COLOR_BGR2BGRA);
buf_.convertTo(nextPyr_[0], CV_32F);
}
for (int level = 1; level <= maxLevel; ++level)
{
cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level]);
cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level]);
}
pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters);
for (int level = maxLevel; level >= 0; level--)
{
if (cn == 1)
{
pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
level, block, patch);
}
else
{
pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
level, block, patch);
}
}
}
void cv::cuda::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err)
{
CV_Assert(prevImg.type() == CV_8UC1);
CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
CV_Assert(maxLevel >= 0);
CV_Assert(winSize.width > 2 && winSize.height > 2);
if (err)
err->create(prevImg.size(), CV_32FC1);
// build the image pyramids.
prevPyr_.resize(maxLevel + 1);
nextPyr_.resize(maxLevel + 1);
prevPyr_[0] = prevImg;
nextImg.convertTo(nextPyr_[0], CV_32F);
for (int level = 1; level <= maxLevel; ++level)
{
cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level]);
cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level]);
}
ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);
ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
uPyr_[0].setTo(Scalar::all(0));
vPyr_[0].setTo(Scalar::all(0));
uPyr_[1].setTo(Scalar::all(0));
vPyr_[1].setTo(Scalar::all(0));
int2 winSize2i = make_int2(winSize.width, winSize.height);
pyrlk::loadConstants(winSize2i, iters);
PtrStepSzf derr = err ? *err : PtrStepSzf();
int idx = 0;
for (int level = maxLevel; level >= 0; level--)
{
int idx2 = (idx + 1) & 1;
pyrlk::dense(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
level == 0 ? derr : PtrStepSzf(), winSize2i);
if (level > 0)
idx = idx2;
}
uPyr_[idx].copyTo(u);
vPyr_[idx].copyTo(v);
}
void cv::cuda::PyrLKOpticalFlow::releaseMemory()
{
prevPyr_.clear();
nextPyr_.clear();
buf_.release();
uPyr_[0].release();
vPyr_[0].release();
uPyr_[1].release();
vPyr_[1].release();
}
#endif /* !defined (HAVE_CUDA) */
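A minimal host-side sketch of the sparse interface above, assuming the feature points already exist on the host; the function and variable names are illustrative and not part of the module:
#include "opencv2/cudaoptflow.hpp"
#include "opencv2/core/cuda.hpp"
#include <vector>
static void examplePyrLKSparse(const cv::cuda::GpuMat& prevImg, const cv::cuda::GpuMat& nextImg,
                               const std::vector<cv::Point2f>& points)
{
    // sparse() expects a single-row CV_32FC2 matrix of input points
    cv::cuda::GpuMat prevPts;
    prevPts.upload(cv::Mat(points).reshape(2, 1));
    cv::cuda::PyrLKOpticalFlow lk;         // winSize 21x21, maxLevel 3, iters 30 by default
    cv::cuda::GpuMat nextPts, status, err;
    lk.sparse(prevImg, nextImg, prevPts, nextPts, status, &err);
    // status is CV_8UC1 (1 = tracked), err holds the average per-point residual
}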

View File

@@ -0,0 +1,259 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
cv::cuda::OpticalFlowDual_TVL1_GPU::OpticalFlowDual_TVL1_GPU() { throw_no_cuda(); }
void cv::cuda::OpticalFlowDual_TVL1_GPU::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::cuda::OpticalFlowDual_TVL1_GPU::collectGarbage() {}
void cv::cuda::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
#else
using namespace cv;
using namespace cv::cuda;
cv::cuda::OpticalFlowDual_TVL1_GPU::OpticalFlowDual_TVL1_GPU()
{
tau = 0.25;
lambda = 0.15;
theta = 0.3;
nscales = 5;
warps = 5;
epsilon = 0.01;
iterations = 300;
scaleStep = 0.8;
useInitialFlow = false;
}
void cv::cuda::OpticalFlowDual_TVL1_GPU::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy)
{
CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
CV_Assert( I0.size() == I1.size() );
CV_Assert( I0.type() == I1.type() );
CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
CV_Assert( nscales > 0 );
// allocate memory for the pyramid structure
I0s.resize(nscales);
I1s.resize(nscales);
u1s.resize(nscales);
u2s.resize(nscales);
I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0);
I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);
if (!useInitialFlow)
{
flowx.create(I0.size(), CV_32FC1);
flowy.create(I0.size(), CV_32FC1);
}
u1s[0] = flowx;
u2s[0] = flowy;
I1x_buf.create(I0.size(), CV_32FC1);
I1y_buf.create(I0.size(), CV_32FC1);
I1w_buf.create(I0.size(), CV_32FC1);
I1wx_buf.create(I0.size(), CV_32FC1);
I1wy_buf.create(I0.size(), CV_32FC1);
grad_buf.create(I0.size(), CV_32FC1);
rho_c_buf.create(I0.size(), CV_32FC1);
p11_buf.create(I0.size(), CV_32FC1);
p12_buf.create(I0.size(), CV_32FC1);
p21_buf.create(I0.size(), CV_32FC1);
p22_buf.create(I0.size(), CV_32FC1);
diff_buf.create(I0.size(), CV_32FC1);
// create the scales
for (int s = 1; s < nscales; ++s)
{
cuda::resize(I0s[s-1], I0s[s], Size(), scaleStep, scaleStep);
cuda::resize(I1s[s-1], I1s[s], Size(), scaleStep, scaleStep);
if (I0s[s].cols < 16 || I0s[s].rows < 16)
{
nscales = s;
break;
}
if (useInitialFlow)
{
cuda::resize(u1s[s-1], u1s[s], Size(), scaleStep, scaleStep);
cuda::resize(u2s[s-1], u2s[s], Size(), scaleStep, scaleStep);
cuda::multiply(u1s[s], Scalar::all(scaleStep), u1s[s]);
cuda::multiply(u2s[s], Scalar::all(scaleStep), u2s[s]);
}
else
{
u1s[s].create(I0s[s].size(), CV_32FC1);
u2s[s].create(I0s[s].size(), CV_32FC1);
}
}
if (!useInitialFlow)
{
u1s[nscales-1].setTo(Scalar::all(0));
u2s[nscales-1].setTo(Scalar::all(0));
}
// pyramidal structure for computing the optical flow
for (int s = nscales - 1; s >= 0; --s)
{
// compute the optical flow at the current scale
procOneScale(I0s[s], I1s[s], u1s[s], u2s[s]);
// if this was the last scale, finish now
if (s == 0)
break;
// otherwise, upsample the optical flow
// zoom the optical flow for the next finer scale
cuda::resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
cuda::resize(u2s[s], u2s[s - 1], I0s[s - 1].size());
// scale the optical flow with the appropriate zoom factor
cuda::multiply(u1s[s - 1], Scalar::all(1/scaleStep), u1s[s - 1]);
cuda::multiply(u2s[s - 1], Scalar::all(1/scaleStep), u2s[s - 1]);
}
}
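// A minimal usage sketch (illustrative only; "frame0" and "frame1" are assumed to be
// single-channel GpuMats already uploaded to the device):
//
//     cv::cuda::OpticalFlowDual_TVL1_GPU tvl1;
//     cv::cuda::GpuMat flowx, flowy;
//     tvl1(frame0, frame1, flowx, flowy);   // dense flow, one CV_32FC1 map per component
//     tvl1.collectGarbage();                // release the internal buffers when finished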
namespace tvl1flow
{
void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy);
void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho);
void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
PtrStepSzf grad, PtrStepSzf rho_c,
PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22,
PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error,
float l_t, float theta);
void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, float taut);
}
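// The device functions declared above implement one warping iteration of the
// duality-based TV-L1 model, which (informally) minimises
//     E(u) = integral over the image of  |grad u1| + |grad u2| + lambda * |rho(u)|,
// where rho(u) is the brightness-constancy residual of I1 warped by u against I0,
// linearised around the current flow estimate. Roughly (inferred from the
// signatures, so treat this as a sketch of the roles, not a specification):
//   centeredGradient      - spatial derivatives I1x, I1y of the second image
//   warpBackward          - warps I1 and its gradient by the current flow and
//                           precomputes grad = |grad I1w|^2 and the constant part of rho
//   estimateU             - primal update: thresholding step on rho plus the divergence
//                           of the dual variables p, writing the per-pixel change to "error"
//   estimateDualVariables - dual ascent on p11..p22 with step taut = tau / theta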
void cv::cuda::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2)
{
using namespace tvl1flow;
const double scaledEpsilon = epsilon * epsilon * I0.size().area();
CV_DbgAssert( I1.size() == I0.size() );
CV_DbgAssert( I1.type() == I0.type() );
CV_DbgAssert( u1.size() == I0.size() );
CV_DbgAssert( u2.size() == u1.size() );
GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
centeredGradient(I1, I1x, I1y);
GpuMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
p11.setTo(Scalar::all(0));
p12.setTo(Scalar::all(0));
p21.setTo(Scalar::all(0));
p22.setTo(Scalar::all(0));
GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
const float l_t = static_cast<float>(lambda * theta);
const float taut = static_cast<float>(tau / theta);
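// l_t scales the thresholding step of the data term and taut is the dual ascent step.
// The inner loop below stops either after "iterations" passes or once the summed
// per-pixel update reported in "diff" falls below scaledEpsilon = epsilon^2 * area.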
for (int warpings = 0; warpings < warps; ++warpings)
{
warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);
double error = std::numeric_limits<double>::max();
for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
{
estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, diff, l_t, static_cast<float>(theta));
if (epsilon > 0)
error = cuda::sum(diff, norm_buf)[0];
estimateDualVariables(u1, u2, p11, p12, p21, p22, taut);
}
}
}
void cv::cuda::OpticalFlowDual_TVL1_GPU::collectGarbage()
{
I0s.clear();
I1s.clear();
u1s.clear();
u2s.clear();
I1x_buf.release();
I1y_buf.release();
I1w_buf.release();
I1wx_buf.release();
I1wy_buf.release();
grad_buf.release();
rho_c_buf.release();
p11_buf.release();
p12_buf.release();
p21_buf.release();
p22_buf.release();
diff_buf.release();
norm_buf.release();
}
#endif // !defined HAVE_CUDA || defined(CUDA_DISABLER)