added wrappers for BroxOpticalFlow and interpolateFrames

2011-10-17 13:12:39 +00:00
parent 87f3451ec6
commit b0536279eb
6 changed files with 986 additions and 216 deletions
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1485,6 +1485,65 @@ namespace cv
            GpuMat maxPosBuffer;
        };

+        ////////////////////////////////// Optical Flow //////////////////////////////////////////
+
+        class CV_EXPORTS BroxOpticalFlow
+        {
+        public:
+            BroxOpticalFlow(float alpha_, float gamma_, float scale_factor_, int inner_iterations_, int outer_iterations_, int solver_iterations_) :
+                alpha(alpha_), gamma(gamma_), scale_factor(scale_factor_), 
+                inner_iterations(inner_iterations_), outer_iterations(outer_iterations_), solver_iterations(solver_iterations_)
+            {
+            }
+
+            //! Compute optical flow
+            //! frame0 - source frame (supports only CV_32FC1 type)
+            //! frame1 - frame to track (with the same size and type as frame0)
+            //! u      - flow horizontal component (along x axis)
+            //! v      - flow vertical component (along y axis)
+            void operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& stream = Stream::Null());
+
+            //! flow smoothness
+	        float alpha;
+
+	        //! gradient constancy importance
+	        float gamma;
+
+	        //! pyramid scale factor
+	        float scale_factor;
+
+	        //! number of lagged non-linearity iterations (inner loop)
+	        int inner_iterations;
+
+	        //! number of warping iterations (number of pyramid levels)
+	        int outer_iterations;
+
+	        //! number of linear system solver iterations
+	        int solver_iterations;
+
+            GpuMat buf;
+        };
+
+        //! Interpolate frames (images) using provided optical flow (displacement field).
+        //! frame0   - frame 0 (32-bit floating point images, single channel)
+        //! frame1   - frame 1 (the same type and size)
+        //! fu       - forward horizontal displacement
+        //! fv       - forward vertical displacement
+        //! bu       - backward horizontal displacement
+        //! bv       - backward vertical displacement
+        //! pos      - new frame position
+        //! newFrame - new frame
+        //! buf      - temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 GpuMat;
+        //!            occlusion masks            0, occlusion masks            1,
+        //!            interpolated forward flow  0, interpolated forward flow  1,
+        //!            interpolated backward flow 0, interpolated backward flow 1
+        //!            
+        CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, 
+            const GpuMat& fu, const GpuMat& fv,
+            const GpuMat& bu, const GpuMat& bv, 
+            float pos, GpuMat& newFrame, GpuMat& buf,
+            Stream& stream = Stream::Null());
+
    }

    //! Speckle filtering - filters small connected components on diparity image.
--- a/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu
+++ b/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu
@@ -65,76 +65,6 @@
 #include "opencv2/gpu/device/utility.hpp"


-////////////////////////////////////////////
-template<typename _Tp> class shared_ptr
-{
-public:
-    shared_ptr()  : obj(0), refcount(0) {}
-    shared_ptr(_Tp* _obj);
-    ~shared_ptr() { release(); }
-    shared_ptr(const shared_ptr& ptr);
-    shared_ptr& operator = (const shared_ptr& ptr);
-    void addref() { if( refcount ) (*refcount)+=1; }
-    void release();
-    void delete_obj() { if( obj ) delete obj; }
-    _Tp* operator -> () { return obj; }
-    const _Tp* operator -> () const { return obj; }
-    operator _Tp* () { return obj; }
-    operator const _Tp*() const { return obj; }
-protected:
-    _Tp* obj; //< the object pointer.
-    int* refcount; //< the associated reference counter
-};
-
-template<typename _Tp> inline shared_ptr<_Tp>::shared_ptr(_Tp* _obj) : obj(_obj)
-{
-    if(obj)
-    {
-        refcount = new int;
-        *refcount = 1;
-    }
-    else
-        refcount = 0;
-}
-
-template<typename _Tp> inline void shared_ptr<_Tp>::release()
-{
-    if( refcount)
-    {
-        *refcount -= 1;
-        if (*refcount == 0)
-        {
-            delete_obj();
-            delete refcount;
-        }
-    }
-    refcount = 0;
-    obj = 0;
-}
-
-template<typename _Tp> inline shared_ptr<_Tp>::shared_ptr(const shared_ptr<_Tp>& ptr)
-{
-    obj = ptr.obj;
-    refcount = ptr.refcount;
-    addref();
-}
-
-template<typename _Tp> inline shared_ptr<_Tp>& shared_ptr<_Tp>::operator = (const shared_ptr<_Tp>& ptr)
-{
-    int* _refcount = ptr.refcount;
-    if( _refcount )
-        *_refcount += 1;
-        
-    release();
-    obj = ptr.obj;
-    refcount = _refcount;
-    return *this;
-}
-
-
-////////////////////////////////////////////
-//using std::tr1::shared_ptr;
-
 typedef NCVVectorAlloc<Ncv32f> FloatVector;

 /////////////////////////////////////////////////////////////////////////////////////////
@@ -738,6 +668,42 @@ void InitTextures()
    initTexture1D(tex_numerator_v);
 }

+namespace
+{
+    struct ImagePyramid
+    {
+        std::vector<FloatVector*> img0;
+        std::vector<FloatVector*> img1;
+
+        std::vector<Ncv32u> w;
+        std::vector<Ncv32u> h;
+
+        explicit ImagePyramid(int outer_iterations)
+        {
+            img0.reserve(outer_iterations);
+            img1.reserve(outer_iterations);
+
+            w.reserve(outer_iterations);
+            h.reserve(outer_iterations);
+        }
+
+        ~ImagePyramid()
+        {
+            w.clear();
+            h.clear();
+
+            for (int i = img0.size() - 1; i >= 0; --i)
+            {
+                delete img1[i];
+                delete img0[i];
+            }
+
+            img0.clear();
+            img1.clear();
+        }
+    };
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////
 // MAIN FUNCTION
 /////////////////////////////////////////////////////////////////////////////////////////
@@ -759,21 +725,19 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
    const Ncv32u kSourceHeight = frame0.height();

    ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT);
-    ncvAssertReturn(uOut.width() == kSourceWidth   && vOut.width() == kSourceWidth && 
+    ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth && 
        uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT);

    ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    bool kSkipProcessing = gpu_mem_allocator.isCounting();

-    cudaDeviceProp device_props;
    int cuda_device;
-
    ncvAssertCUDAReturn(cudaGetDevice(&cuda_device), NCV_CUDA_ERROR);

+    cudaDeviceProp device_props;
    ncvAssertCUDAReturn(cudaGetDeviceProperties(&device_props, cuda_device), NCV_CUDA_ERROR);

-
    Ncv32u alignmentValue = gpu_mem_allocator.alignment ();

    const Ncv32u kStrideAlignmentFloat = alignmentValue / sizeof(float);
@@ -817,8 +781,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,

    // temporary storage
    SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator, 
-        alignUp(kSourceWidth, kStrideAlignmentFloat) 
-        * alignUp(kSourceHeight, kStrideAlignmentFloat));
+        alignUp(kSourceWidth, kStrideAlignmentFloat) * alignUp(kSourceHeight, kStrideAlignmentFloat));

    // image derivatives
    SAFE_VECTOR_DECL(Ix,  gpu_mem_allocator, kSizeInPixelsAligned);
@@ -831,35 +794,31 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,

    // spatial derivative filter size
    const int kDFilterSize = 5;
-    const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};
    SAFE_VECTOR_DECL(derivativeFilter, gpu_mem_allocator, kDFilterSize);

-    ncvAssertCUDAReturn(
-        cudaMemcpy(derivativeFilter.ptr(), 
-                   derivativeFilterHost, 
-                   sizeof(float) * kDFilterSize, 
-                   cudaMemcpyHostToDevice),
-        NCV_CUDA_ERROR);
+    if (!kSkipProcessing)
+    {
+        const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};

-    InitTextures();
+        ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize, 
+            cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
+
+        InitTextures();
+    }

    //prepare image pyramid
-    std::vector< shared_ptr<FloatVector> > img0_pyramid;
-    std::vector< shared_ptr<FloatVector> > img1_pyramid;
-
-    std::vector<Ncv32u> w_pyramid;
-    std::vector<Ncv32u> h_pyramid;
+    ImagePyramid pyr(desc.number_of_outer_iterations);

    cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();

    float scale = 1.0f;

    //cuda arrays for frames
-    shared_ptr<FloatVector> I0(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
-    ncvAssertReturn(I0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
+    std::auto_ptr<FloatVector> pI0(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
+    ncvAssertReturn(pI0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

-    shared_ptr<FloatVector> I1(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
-    ncvAssertReturn(I1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
+    std::auto_ptr<FloatVector> pI1(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
+    ncvAssertReturn(pI1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    if (!kSkipProcessing)
    {
@@ -867,25 +826,29 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
        size_t dst_width_in_bytes = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float);
        size_t src_width_in_bytes = kSourceWidth * sizeof(float);
        size_t src_pitch_in_bytes = frame0.pitch();
-        ncvAssertCUDAReturn( cudaMemcpy2DAsync(I0->ptr(), dst_width_in_bytes, frame0.ptr(), 
+
+        ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(), 
            src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );

-        ncvAssertCUDAReturn( cudaMemcpy2DAsync(I1->ptr(), dst_width_in_bytes, frame1.ptr(), 
+        ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(), 
            src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
    }

-        //prepare pyramid
-    img0_pyramid.push_back(I0);
-    img1_pyramid.push_back(I1);
+    FloatVector* I0 = pI0.release();
+    FloatVector* I1 = pI1.release();

-    w_pyramid.push_back(kSourceWidth);
-    h_pyramid.push_back(kSourceHeight);
+        //prepare pyramid
+    pyr.img0.push_back(I0);
+    pyr.img1.push_back(I1);
+
+    pyr.w.push_back(kSourceWidth);
+    pyr.h.push_back(kSourceHeight);

    scale *= scale_factor;

    Ncv32u prev_level_width  = kSourceWidth;
    Ncv32u prev_level_height = kSourceHeight;
-    while((prev_level_width > 15) && (prev_level_height > 15) && (static_cast<Ncv32u>(img0_pyramid.size()) < desc.number_of_outer_iterations))
+    while((prev_level_width > 15) && (prev_level_height > 15) && (static_cast<Ncv32u>(pyr.img0.size()) < desc.number_of_outer_iterations))
    {
        //current resolution
        Ncv32u level_width  = static_cast<Ncv32u>(ceilf(kSourceWidth  * scale));
@@ -897,16 +860,16 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,

        Ncv32u prev_level_pitch = alignUp(prev_level_width, kStrideAlignmentFloat) * sizeof(float);

-        shared_ptr<FloatVector> level_frame0(new FloatVector(gpu_mem_allocator, buffer_size));
+        std::auto_ptr<FloatVector> level_frame0(new FloatVector(gpu_mem_allocator, buffer_size));
        ncvAssertReturn(level_frame0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

-        shared_ptr<FloatVector> level_frame1(new FloatVector(gpu_mem_allocator, buffer_size));
+        std::auto_ptr<FloatVector> level_frame1(new FloatVector(gpu_mem_allocator, buffer_size));
        ncvAssertReturn(level_frame1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

-        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
-
        if (!kSkipProcessing)
        {
+            ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
+
            NcvSize32u srcSize (prev_level_width, prev_level_height);
            NcvSize32u dstSize (level_width, level_height);
            NcvRect32u srcROI (0, 0, prev_level_width, prev_level_height);
@@ -921,20 +884,20 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
                level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample);
        }

-        //store pointers
-        img0_pyramid.push_back(level_frame0);
-        img1_pyramid.push_back(level_frame1);
+        I0 = level_frame0.release();
+        I1 = level_frame1.release();

-        w_pyramid.push_back(level_width);
-        h_pyramid.push_back(level_height);
+        //store pointers
+        pyr.img0.push_back(I0);
+        pyr.img1.push_back(I1);
+
+        pyr.w.push_back(level_width);
+        pyr.h.push_back(level_height);

        scale *= scale_factor;

        prev_level_width  = level_width;
        prev_level_height = level_height;
-
-        I0 = level_frame0;
-        I1 = level_frame1;
    }

    if (!kSkipProcessing)
@@ -944,62 +907,56 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
        ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);

        //select images with lowest resolution
-        size_t pitch = alignUp(w_pyramid.back(), kStrideAlignmentFloat) * sizeof(float);
-        ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, img0_pyramid.back()->ptr(), channel_desc, w_pyramid.back(), h_pyramid.back(), pitch), NCV_CUDA_ERROR);
-        ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, img1_pyramid.back()->ptr(), channel_desc, w_pyramid.back(), h_pyramid.back(), pitch), NCV_CUDA_ERROR);
+        size_t pitch = alignUp(pyr.w.back(), kStrideAlignmentFloat) * sizeof(float);
+        ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, pyr.img0.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
+        ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, pyr.img1.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
-    }

-    FloatVector* ptrU = &u;
-    FloatVector* ptrV = &v;
-    FloatVector* ptrUNew = &u_new;
-    FloatVector* ptrVNew = &v_new;
+        FloatVector* ptrU = &u;
+        FloatVector* ptrV = &v;
+        FloatVector* ptrUNew = &u_new;
+        FloatVector* ptrVNew = &v_new;

-    std::vector< shared_ptr<FloatVector> >::const_reverse_iterator img0Iter = img0_pyramid.rbegin();
-    std::vector< shared_ptr<FloatVector> >::const_reverse_iterator img1Iter = img1_pyramid.rbegin();
-    //outer loop
-    //warping fixed point iteration
-    while(!w_pyramid.empty())
-    {
-        //current grid dimensions
-        const Ncv32u kLevelWidth  = w_pyramid.back();
-        const Ncv32u kLevelHeight = h_pyramid.back();
-        const Ncv32u kLevelStride = alignUp(kLevelWidth, kStrideAlignmentFloat);
+        std::vector<FloatVector*>::const_reverse_iterator img0Iter = pyr.img0.rbegin();
+        std::vector<FloatVector*>::const_reverse_iterator img1Iter = pyr.img1.rbegin();

-        //size of current image in bytes
-        const int kLevelSizeInBytes = kLevelStride * kLevelHeight * sizeof(float);
-
-        //number of points at current resolution
-        const int kLevelSizeInPixels = kLevelStride * kLevelHeight;
-
-        if (!kSkipProcessing)
+        //outer loop
+        //warping fixed point iteration
+        while(!pyr.w.empty())
        {
+            //current grid dimensions
+            const Ncv32u kLevelWidth  = pyr.w.back();
+            const Ncv32u kLevelHeight = pyr.h.back();
+            const Ncv32u kLevelStride = alignUp(kLevelWidth, kStrideAlignmentFloat);
+
+            //size of current image in bytes
+            const int kLevelSizeInBytes = kLevelStride * kLevelHeight * sizeof(float);
+
+            //number of points at current resolution
+            const int kLevelSizeInPixels = kLevelStride * kLevelHeight;
+
            //initial guess for du and dv
            ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
-        }

-        //texture format descriptor
-        cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
+            //texture format descriptor
+            cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();

-        I0 = *img0Iter;
-        I1 = *img1Iter;
+            I0 = *img0Iter;
+            I1 = *img1Iter;

-        ++img0Iter;
-        ++img1Iter;
+            ++img0Iter;
+            ++img1Iter;

-        if (!kSkipProcessing)
-        {
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), channel_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), channel_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
-        }
-        //compute derivatives
-        dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
-        dim3 dThreads(32, 6);

-        const int kPitchTex = kLevelStride * sizeof(float);
-        if (!kSkipProcessing)
-        {
+            //compute derivatives
+            dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
+            dim3 dThreads(32, 6);
+
+            const int kPitchTex = kLevelStride * sizeof(float);
+        
            NcvSize32u srcSize(kLevelWidth, kLevelHeight);
            Ncv32u nSrcStep = kLevelStride * sizeof(float);
            NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight);
@@ -1031,10 +988,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
            // Ixy
            nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f); 
-        }
-
-        if (!kSkipProcessing)
-        {
+      
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix,  Ix.ptr(),  channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
@@ -1049,23 +1003,19 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
            //    flow increments
            ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-        }

+            dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
+            dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);

-        dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
-        dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);
+            dim3 sor_blocks(iDivUp(kLevelWidth, SOR_TILE_WIDTH), iDivUp(kLevelHeight, SOR_TILE_HEIGHT));
+            dim3 sor_threads(SOR_TILE_WIDTH, SOR_TILE_HEIGHT);

-        dim3 sor_blocks(iDivUp(kLevelWidth, SOR_TILE_WIDTH), iDivUp(kLevelHeight, SOR_TILE_HEIGHT));
-        dim3 sor_threads(SOR_TILE_WIDTH, SOR_TILE_HEIGHT);
-
-        // inner loop
-        // lagged nonlinearity fixed point iteration
-        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
-        for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
-        {
-            //compute coefficients
-            if (!kSkipProcessing)
+            // inner loop
+            // lagged nonlinearity fixed point iteration
+            ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
+            for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
            {
+                //compute coefficients
                prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>
                    (diffusivity_x.ptr(), 
                     diffusivity_y.ptr(), 
@@ -1101,13 +1051,12 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,

                ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-            }
-            //solve linear system
-            for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
-            {
-                float omega = 1.99f;
-                if (!kSkipProcessing)
+            
+                //solve linear system
+                for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
                {
+                    float omega = 1.99f;
+                
                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

@@ -1139,33 +1088,29 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
                        kLevelWidth, 
                        kLevelHeight, 
                        kLevelStride);
-                }

-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-            }//end of solver loop
-        }// end of inner loop
+                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+                }//end of solver loop
+            }// end of inner loop

-        //update u and v
-        if (!kSkipProcessing)
-        {
+            //update u and v
            add(ptrU->ptr(), du.ptr(), kLevelSizeInPixels, stream);
            add(ptrV->ptr(), dv.ptr(), kLevelSizeInPixels, stream);
-        }
-        //prolongate using texture
-        w_pyramid.pop_back();
-        h_pyramid.pop_back();
-        if (!w_pyramid.empty())
-        {
-            //compute new image size
-            Ncv32u nw = w_pyramid.back();
-            Ncv32u nh = h_pyramid.back();
-            Ncv32u ns = alignUp(nw, kStrideAlignmentFloat);

-            dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
-            dim3 p_threads(32, 8);
-            if (!kSkipProcessing)
+            //prolongate using texture
+            pyr.w.pop_back();
+            pyr.h.pop_back();
+            if (!pyr.w.empty())
            {
+                //compute new image size
+                Ncv32u nw = pyr.w.back();
+                Ncv32u nh = pyr.h.back();
+                Ncv32u ns = alignUp(nw, kStrideAlignmentFloat);
+
+                dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
+                dim3 p_threads(32, 8);
+            
                NcvSize32u srcSize (kLevelWidth, kLevelHeight);
                NcvSize32u dstSize (nw, nh);
                NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
@@ -1180,27 +1125,27 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
                    ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic);
 					
 				ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
+
+                cv::gpu::device::swap<FloatVector*>(ptrU, ptrUNew);
+                cv::gpu::device::swap<FloatVector*>(ptrV, ptrVNew);
            }
-
-            cv::gpu::device::swap<FloatVector*>(ptrU, ptrUNew);
-            cv::gpu::device::swap<FloatVector*>(ptrV, ptrVNew);
+            scale /= scale_factor;
        }
-        scale /= scale_factor;
+
+        // end of warping iterations
+        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
+
+        ncvAssertCUDAReturn( cudaMemcpy2DAsync
+            (uOut.ptr(), uOut.pitch(), ptrU->ptr(), 
+            kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
+
+        ncvAssertCUDAReturn( cudaMemcpy2DAsync
+            (vOut.ptr(), vOut.pitch(), ptrV->ptr(), 
+            kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
+
+        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
+        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
    }

-    // end of warping iterations
-    ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
-
-    ncvAssertCUDAReturn( cudaMemcpy2DAsync
-        (uOut.ptr(), uOut.pitch(), ptrU->ptr(), 
-        kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
-
-    ncvAssertCUDAReturn( cudaMemcpy2DAsync
-        (vOut.ptr(), vOut.pitch(), ptrV->ptr(), 
-        kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
-
-    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
-    ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
-
    return NCV_SUCCESS;
 }
--- a/modules/gpu/src/optical_flow.cpp
+++ b/modules/gpu/src/optical_flow.cpp
@@ -0,0 +1,198 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other GpuMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or bpied warranties, including, but not limited to, the bpied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+using namespace std;
+
+#if !defined (HAVE_CUDA)
+
+void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::interpolateFrames(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, float, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+
+#else
+
+namespace
+{
+    size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc, const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
+                      NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
+    {
+        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
+        CV_Assert(gpuCounter.isInitialized());
+
+        NCVStatus ncvStat = NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0);
+        CV_Assert(ncvStat == NCV_SUCCESS);
+
+        return gpuCounter.maxSize();
+    }
+}
+
+namespace
+{
+    void outputHandler(const char* msg) 
+    { 
+        CV_Error(CV_GpuApiCallError, msg); 
+    }
+}
+
+void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s) 
+{
+    ncvSetDebugOutputHandler(outputHandler);
+
+    CV_Assert(frame0.type() == CV_32FC1);
+    CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
+
+    u.create(frame0.size(), CV_32FC1);
+    v.create(frame0.size(), CV_32FC1);
+
+    cudaDeviceProp devProp;
+    cudaSafeCall( cudaGetDeviceProperties(&devProp, getDevice()) );
+
+    NCVBroxOpticalFlowDescriptor desc;
+
+    desc.alpha = alpha;
+    desc.gamma = gamma;
+    desc.scale_factor = scale_factor;
+    desc.number_of_inner_iterations = inner_iterations;
+    desc.number_of_outer_iterations = outer_iterations;
+    desc.number_of_solver_iterations = solver_iterations;
+
+    NCVMemSegment frame0MemSeg;
+    frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
+    frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
+    frame0MemSeg.size = frame0.step * frame0.rows;
+
+    NCVMemSegment frame1MemSeg;
+    frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
+    frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
+    frame1MemSeg.size = frame1.step * frame1.rows;
+
+    NCVMemSegment uMemSeg;
+    uMemSeg.begin.memtype = NCVMemoryTypeDevice;
+    uMemSeg.begin.ptr = u.ptr();
+    uMemSeg.size = u.step * u.rows;
+
+    NCVMemSegment vMemSeg;
+    vMemSeg.begin.memtype = NCVMemoryTypeDevice;
+    vMemSeg.begin.ptr = v.ptr();
+    vMemSeg.size = v.step * v.rows;
+
+    NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, devProp.textureAlignment, frame0.cols, frame0.rows, frame0.step);
+    NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, devProp.textureAlignment, frame1.cols, frame1.rows, frame1.step);
+    NCVMatrixReuse<Ncv32f> uMat(uMemSeg, devProp.textureAlignment, u.cols, u.rows, u.step);
+    NCVMatrixReuse<Ncv32f> vMat(vMemSeg, devProp.textureAlignment, v.cols, v.rows, v.step);
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, devProp);
+
+    ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
+
+    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
+    CV_Assert(gpuAllocator.isInitialized());
+    
+    NCVStatus ncvStat = NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream);
+    CV_Assert(ncvStat == NCV_SUCCESS);
+}
+
+void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, const GpuMat& fu, const GpuMat& fv, const GpuMat& bu, const GpuMat& bv, 
+                                float pos, GpuMat& newFrame, GpuMat& buf, Stream& s)
+{
+    CV_Assert(frame0.type() == CV_32FC1);
+    CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
+    CV_Assert(fu.size() == frame0.size() && fu.type() == frame0.type());
+    CV_Assert(fv.size() == frame0.size() && fv.type() == frame0.type());
+    CV_Assert(bu.size() == frame0.size() && bu.type() == frame0.type());
+    CV_Assert(bv.size() == frame0.size() && bv.type() == frame0.type());
+
+    newFrame.create(frame0.size(), frame0.type());
+
+    buf.create(6 * frame0.rows, frame0.cols, CV_32FC1);
+    buf.setTo(Scalar::all(0));
+    
+    // occlusion masks
+    GpuMat occ0 = buf.rowRange(0 * frame0.rows, 1 * frame0.rows);
+    GpuMat occ1 = buf.rowRange(1 * frame0.rows, 2 * frame0.rows);
+
+    // interpolated forward flow
+    GpuMat fui = buf.rowRange(2 * frame0.rows, 3 * frame0.rows);
+    GpuMat fvi = buf.rowRange(3 * frame0.rows, 4 * frame0.rows);
+
+    // interpolated backward flow
+    GpuMat bui = buf.rowRange(4 * frame0.rows, 5 * frame0.rows);
+    GpuMat bvi = buf.rowRange(5 * frame0.rows, 6 * frame0.rows);
+
+    size_t step = frame0.step;
+
+    CV_Assert(frame1.step == step && fu.step == step && fv.step == step && bu.step == step && bv.step == step && newFrame.step == step && buf.step == step);
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+    NppStStreamHandler h(stream);
+
+    NppStInterpolationState state;
+
+    state.size         = NcvSize32u(frame0.cols, frame0.rows);
+    state.nStep        = static_cast<Ncv32u>(step);
+    state.pSrcFrame0   = const_cast<Ncv32f*>(frame0.ptr<Ncv32f>());
+    state.pSrcFrame1   = const_cast<Ncv32f*>(frame1.ptr<Ncv32f>());
+    state.pFU          = const_cast<Ncv32f*>(fu.ptr<Ncv32f>());
+    state.pFV          = const_cast<Ncv32f*>(fv.ptr<Ncv32f>());
+    state.pBU          = const_cast<Ncv32f*>(bu.ptr<Ncv32f>());
+    state.pBV          = const_cast<Ncv32f*>(bv.ptr<Ncv32f>());
+    state.pos          = pos;
+    state.pNewFrame    = newFrame.ptr<Ncv32f>();
+    state.ppBuffers[0] = occ0.ptr<Ncv32f>();
+    state.ppBuffers[1] = occ1.ptr<Ncv32f>();
+    state.ppBuffers[2] = fui.ptr<Ncv32f>();
+    state.ppBuffers[3] = fvi.ptr<Ncv32f>();
+    state.ppBuffers[4] = bui.ptr<Ncv32f>();
+    state.ppBuffers[5] = bvi.ptr<Ncv32f>();
+
+    nppSafeCall( nppiStInterpolateFrames(&state) );
+
+    if (stream == 0)
+        cudaSafeCall( cudaDeviceSynchronize() );
+}
+
+#endif /* HAVE_CUDA */
--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -76,6 +76,7 @@
    #include "nvidia/core/NCV.hpp"
    #include "nvidia/NPP_staging/NPP_staging.hpp"
    #include "nvidia/NCVHaarObjectDetection.hpp"
+    #include "nvidia/NCVBroxOpticalFlow.hpp"

 #define CUDART_MINIMUM_REQUIRED_VERSION 4000
 #define NPP_MINIMUM_REQUIRED_VERSION 4000
--- a/modules/gpu/test/test_video.cpp
+++ b/modules/gpu/test/test_video.cpp
@@ -0,0 +1,227 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+//#define DUMP
+
+struct BroxOpticalFlow : testing::TestWithParam< cv::gpu::DeviceInfo >
+{
+    cv::gpu::DeviceInfo devInfo;
+    
+    cv::Mat frame0;
+    cv::Mat frame1;
+
+    cv::Mat u_gold;
+    cv::Mat v_gold;
+
+    virtual void SetUp()
+    {
+        devInfo = GetParam();
+
+        cv::gpu::setDevice(devInfo.deviceID());
+        
+        frame0 = readImage("opticalflow/frame0.png", cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame0.empty());
+        frame0.convertTo(frame0, CV_32F, 1.0 / 255.0);
+        
+        frame1 = readImage("opticalflow/frame1.png", cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame1.empty());
+        frame1.convertTo(frame1, CV_32F, 1.0 / 255.0);
+
+#ifndef DUMP
+
+        std::ifstream f((std::string(cvtest::TS::ptr()->get_data_path()) + "opticalflow/opticalflow_gold.bin").c_str(), std::ios_base::binary);
+
+        int rows, cols;
+
+        f.read((char*)&rows, sizeof(rows));
+        f.read((char*)&cols, sizeof(cols));
+
+        u_gold.create(rows, cols, CV_32FC1);
+
+        for (int i = 0; i < u_gold.rows; ++i)
+            f.read((char*)u_gold.ptr(i), u_gold.cols * sizeof(float));
+
+        v_gold.create(rows, cols, CV_32FC1);
+
+        for (int i = 0; i < v_gold.rows; ++i)
+            f.read((char*)v_gold.ptr(i), v_gold.cols * sizeof(float));
+
+#endif
+    }
+};
+
+TEST_P(BroxOpticalFlow, Regression)
+{
+    PRINT_PARAM(devInfo);
+
+    cv::Mat u;
+    cv::Mat v;
+
+    cv::gpu::BroxOpticalFlow d_flow(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/, 
+                                    10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
+
+    ASSERT_NO_THROW(
+        cv::gpu::GpuMat d_u; 
+        cv::gpu::GpuMat d_v;
+        d_flow(cv::gpu::GpuMat(frame0), cv::gpu::GpuMat(frame1), d_u, d_v);
+        d_u.download(u);
+        d_v.download(v);
+        d_flow.buf.release();
+    );
+
+#ifndef DUMP
+
+    EXPECT_MAT_NEAR(u_gold, u, 0);
+    EXPECT_MAT_NEAR(v_gold, v, 0);
+
+#else
+
+    std::ofstream f((std::string(cvtest::TS::ptr()->get_data_path()) + "opticalflow/opticalflow_gold.bin").c_str(), std::ios_base::binary);
+
+    f.write((char*)&u.rows, sizeof(u.rows));
+    f.write((char*)&u.cols, sizeof(u.cols));
+
+    for (int i = 0; i < u.rows; ++i)
+        f.write((char*)u.ptr(i), u.cols * sizeof(float));
+
+    for (int i = 0; i < v.rows; ++i)
+        f.write((char*)v.ptr(i), v.cols * sizeof(float));
+
+#endif
+}
+
+INSTANTIATE_TEST_CASE_P(Video, BroxOpticalFlow, testing::ValuesIn(devices()));
+
+
+
+
+
+
+struct InterpolateFrames : testing::TestWithParam< cv::gpu::DeviceInfo >
+{
+    cv::gpu::DeviceInfo devInfo;
+    
+    cv::Mat frame0;
+    cv::Mat frame1;
+
+    cv::Mat newFrame_gold;
+
+    virtual void SetUp()
+    {
+        devInfo = GetParam();
+
+        cv::gpu::setDevice(devInfo.deviceID());
+        
+        frame0 = readImage("opticalflow/frame0.png", cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame0.empty());
+        frame0.convertTo(frame0, CV_32F, 1.0 / 255.0);
+        
+        frame1 = readImage("opticalflow/frame1.png", cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame1.empty());
+        frame1.convertTo(frame1, CV_32F, 1.0 / 255.0);
+
+#ifndef DUMP
+
+        std::ifstream f((std::string(cvtest::TS::ptr()->get_data_path()) + "opticalflow/interpolate_frames_gold.bin").c_str(), std::ios_base::binary);
+
+        int rows, cols;
+
+        f.read((char*)&rows, sizeof(rows));
+        f.read((char*)&cols, sizeof(cols));
+
+        newFrame_gold.create(rows, cols, CV_32FC1);
+
+        for (int i = 0; i < newFrame_gold.rows; ++i)
+            f.read((char*)newFrame_gold.ptr(i), newFrame_gold.cols * sizeof(float));
+
+#endif
+    }
+};
+
+TEST_P(InterpolateFrames, Regression)
+{
+    PRINT_PARAM(devInfo);
+
+    cv::Mat newFrame;
+
+    cv::gpu::BroxOpticalFlow d_flow(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/, 
+                                    10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
+
+    ASSERT_NO_THROW(
+        cv::gpu::GpuMat d_frame0(frame0);
+        cv::gpu::GpuMat d_frame1(frame1);
+        cv::gpu::GpuMat d_fu; 
+        cv::gpu::GpuMat d_fv;
+        cv::gpu::GpuMat d_bu; 
+        cv::gpu::GpuMat d_bv;
+        cv::gpu::GpuMat d_newFrame;
+        cv::gpu::GpuMat d_buf;
+        d_flow(d_frame0, d_frame1, d_fu, d_fv);
+        d_flow(d_frame1, d_frame0, d_bu, d_bv);
+        cv::gpu::interpolateFrames(d_frame0, d_frame1, d_fu, d_fv, d_bu, d_bv, 0.5f, d_newFrame, d_buf);
+        d_newFrame.download(newFrame);
+        d_flow.buf.release();
+    );
+
+#ifndef DUMP
+
+    EXPECT_MAT_NEAR(newFrame_gold, newFrame, 1e-4);
+
+#else
+
+    std::ofstream f((std::string(cvtest::TS::ptr()->get_data_path()) + "opticalflow/interpolate_frames_gold.bin").c_str(), std::ios_base::binary);
+
+    f.write((char*)&newFrame.rows, sizeof(newFrame.rows));
+    f.write((char*)&newFrame.cols, sizeof(newFrame.cols));
+
+    for (int i = 0; i < newFrame.rows; ++i)
+        f.write((char*)newFrame.ptr(i), newFrame.cols * sizeof(float));
+
+#endif
+}
+
+INSTANTIATE_TEST_CASE_P(Video, InterpolateFrames, testing::ValuesIn(devices()));
+
+#endif
--- a/samples/gpu/optical_flow.cpp
+++ b/samples/gpu/optical_flow.cpp
@@ -0,0 +1,340 @@
+#include <iostream>
+#include <iomanip>
+#include <string>
+
+#include "cvconfig.h"
+#include "opencv2/core/core.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+#ifdef HAVE_CUDA
+#include "NPP_staging/NPP_staging.hpp"
+#endif
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+#if !defined(HAVE_CUDA)
+
+int main(int argc, const char* argv[])
+{
+    cout << "Please compile the library with CUDA support" << endl;
+    return -1;
+}
+
+#else
+
+#define PARAM_INPUT     "--input"
+#define PARAM_SCALE     "--scale"
+#define PARAM_ALPHA     "--alpha"
+#define PARAM_GAMMA     "--gamma"
+#define PARAM_INNER     "--inner"
+#define PARAM_OUTER     "--outer"
+#define PARAM_SOLVER    "--solver"
+#define PARAM_TIME_STEP "--time-step"
+#define PARAM_HELP      "--help"
+
+void printHelp()
+{
+    cout << "Usage help:\n";
+    cout << setiosflags(ios::left);
+    cout << "\t" << setw(15) << PARAM_ALPHA << " - set alpha\n";
+    cout << "\t" << setw(15) << PARAM_GAMMA << " - set gamma\n";
+    cout << "\t" << setw(15) << PARAM_INNER << " - set number of inner iterations\n";
+    cout << "\t" << setw(15) << PARAM_INPUT << " - specify input file names (2 image files)\n";
+    cout << "\t" << setw(15) << PARAM_OUTER << " - set number of outer iterations\n";
+    cout << "\t" << setw(15) << PARAM_SCALE << " - set pyramid scale factor\n";
+    cout << "\t" << setw(15) << PARAM_SOLVER << " - set number of basic solver iterations\n";
+    cout << "\t" << setw(15) << PARAM_TIME_STEP << " - set frame interpolation time step\n";
+    cout << "\t" << setw(15) << PARAM_HELP << " - display this help message\n";
+}
+
+int processCommandLine(int argc, const char* argv[], float& timeStep, string& frame0Name, string& frame1Name, BroxOpticalFlow& flow)
+{
+    timeStep = 0.25f;
+
+    for (int iarg = 1; iarg < argc; ++iarg)
+    {
+        if (strcmp(argv[iarg], PARAM_INPUT) == 0)
+        {
+            if (iarg + 2 < argc)
+            {
+                frame0Name = argv[++iarg];
+                frame1Name = argv[++iarg];
+            }
+            else
+                return -1;
+        }
+        else if(strcmp(argv[iarg], PARAM_SCALE) == 0)
+        {
+            if (iarg + 1 < argc)
+                flow.scale_factor = static_cast<float>(atof(argv[++iarg]));
+            else
+                return -1;
+        }
+        else if(strcmp(argv[iarg], PARAM_ALPHA) == 0)
+        {
+            if (iarg + 1 < argc)
+                flow.alpha = static_cast<float>(atof(argv[++iarg]));
+            else
+                return -1;
+        }
+        else if(strcmp(argv[iarg], PARAM_GAMMA) == 0)
+        {
+            if (iarg + 1 < argc)
+                flow.gamma = static_cast<float>(atof(argv[++iarg]));
+            else
+                return -1;
+        }
+        else if(strcmp(argv[iarg], PARAM_INNER) == 0)
+        {
+            if (iarg + 1 < argc)
+                flow.inner_iterations = atoi(argv[++iarg]);
+            else
+                return -1;
+        }
+        else if(strcmp(argv[iarg], PARAM_OUTER) == 0)
+        {
+            if (iarg + 1 < argc)
+                flow.outer_iterations = atoi(argv[++iarg]);
+            else
+                return -1;
+        }
+        else if(strcmp(argv[iarg], PARAM_SOLVER) == 0)
+        {
+            if (iarg + 1 < argc)
+                flow.solver_iterations = atoi(argv[++iarg]);
+            else
+                return -1;
+        }
+        else if(strcmp(argv[iarg], PARAM_TIME_STEP) == 0)
+        {
+            if (iarg + 1 < argc)
+                timeStep = static_cast<float>(atof(argv[++iarg]));
+            else
+                return -1;
+        }
+        else if(strcmp(argv[iarg], PARAM_HELP) == 0)
+        {
+            printHelp();
+            return 0;
+        }
+    }
+    return 0;
+}
+
+template <typename T> inline T clamp (T x, T a, T b)
+{
+    return ((x) > (a) ? ((x) < (b) ? (x) : (b)) : (a));
+}
+
+template <typename T> inline T mapValue(T x, T a, T b, T c, T d)
+{
+    x = clamp(x, a, b);
+    return c + (d - c) * (x - a) / (b - a);
+}
+
+void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
+{
+    float maxDisplacement = 1.0f;
+
+    for (int i = 0; i < u.rows; ++i)
+    {
+        const float* ptr_u = u.ptr<float>(i);
+        const float* ptr_v = v.ptr<float>(i);
+
+        for (int j = 0; j < u.cols; ++j)
+        {
+            float d = max(fabsf(ptr_u[j]), fabsf(ptr_v[j]));
+
+            if (d > maxDisplacement) 
+                maxDisplacement = d;
+        }
+    }
+
+    flowField.create(u.size(), CV_8UC4);
+
+    for (int i = 0; i < flowField.rows; ++i)
+    {
+        const float* ptr_u = u.ptr<float>(i);
+        const float* ptr_v = v.ptr<float>(i);
+
+        Vec4b* row = flowField.ptr<Vec4b>(i);
+
+        for (int j = 0; j < flowField.cols; ++j)
+        {
+            row[j][0] = 0;
+            row[j][1] = static_cast<unsigned char> (mapValue (-ptr_v[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
+            row[j][2] = static_cast<unsigned char> (mapValue ( ptr_u[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
+            row[j][3] = 255;
+        }
+    }
+}
+
+int main(int argc, const char* argv[])
+{
+    string frame0Name, frame1Name;
+    float timeStep = 0.01f;
+
+    BroxOpticalFlow d_flow(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/, 
+                           10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
+
+    int result = processCommandLine(argc, argv, timeStep, frame0Name, frame1Name, d_flow);
+    if (argc == 1 || result)
+    {
+        printHelp();
+        return result;
+    }
+
+    if (frame0Name.empty() || frame1Name.empty())
+    {
+        cout << "Missing input file names\n";
+        return -1;
+    }
+
+    Mat frame0Color = imread(frame0Name);
+    Mat frame1Color = imread(frame1Name);
+
+    if (frame0Color.empty() || frame1Color.empty())
+    {
+        cout << "Can't load input images\n";
+        return -1;
+    }
+
+    cout << "OpenCV / NVIDIA Computer Vision\n";
+    cout << "Optical Flow Demo: Frame Interpolation\n";
+    cout << "=========================================\n";
+    cout << "Press:\n ESC to quit\n 'a' to move to the previous frame\n 's' to move to the next frame\n";
+
+    frame0Color.convertTo(frame0Color, CV_32F, 1.0 / 255.0);
+    frame1Color.convertTo(frame1Color, CV_32F, 1.0 / 255.0);
+
+    Mat frame0Gray, frame1Gray;
+
+    cvtColor(frame0Color, frame0Gray, COLOR_BGR2GRAY);
+    cvtColor(frame1Color, frame1Gray, COLOR_BGR2GRAY);
+
+    GpuMat d_frame0(frame0Gray);
+    GpuMat d_frame1(frame1Gray);
+    
+    Mat fu, fv;
+    Mat bu, bv;
+
+    GpuMat d_fu, d_fv;
+    GpuMat d_bu, d_bv;
+
+    cout << "Estimating optical flow\nForward...\n";
+
+    d_flow(d_frame0, d_frame1, d_fu, d_fv);
+    d_flow(d_frame1, d_frame0, d_bu, d_bv);
+    
+    d_fu.download(fu);
+    d_fv.download(fv);
+    
+    d_bu.download(bu);
+    d_bv.download(bv);
+
+    // first frame color components (GPU memory)
+    GpuMat d_b, d_g, d_r;
+
+    // second frame color components (GPU memory)
+    GpuMat d_bt, d_gt, d_rt;
+
+    // prepare color components on host and copy them to device memory
+    Mat channels[3];
+
+    cv::split(frame0Color, channels);
+
+    d_b.upload(channels[0]);
+    d_g.upload(channels[1]);
+    d_r.upload(channels[2]);
+
+    cv::split(frame1Color, channels);
+
+    d_bt.upload(channels[0]);
+    d_gt.upload(channels[1]);
+    d_rt.upload(channels[2]);
+
+    cout << "Interpolating...\n";
+    cout.precision (4);
+    
+    // temporary buffer
+    GpuMat d_buf;
+
+    // intermediate frame color components (GPU memory)
+    GpuMat d_rNew, d_gNew, d_bNew;
+
+    GpuMat d_newFrame;
+
+    vector<Mat> frames;
+    frames.reserve(1.0f / timeStep + 2);
+
+    frames.push_back(frame0Color);
+
+    // compute interpolated frames
+    for (float timePos = timeStep; timePos < 1.0f; timePos += timeStep)
+    {
+        // interpolate blue channel
+        interpolateFrames(d_b, d_bt, d_fu, d_fv, d_bu, d_bv, timePos, d_bNew, d_buf);
+        // interpolate green channel
+        interpolateFrames(d_g, d_gt, d_fu, d_fv, d_bu, d_bv, timePos, d_gNew, d_buf);
+        // interpolate red channel
+        interpolateFrames(d_r, d_rt, d_fu, d_fv, d_bu, d_bv, timePos, d_rNew, d_buf);
+
+        GpuMat channels[] = {d_bNew, d_gNew, d_rNew};
+        merge(channels, 3, d_newFrame);
+
+        Mat newFrame;
+        d_newFrame.download(newFrame);
+
+        frames.push_back(newFrame);
+
+        cout << timePos * 100.0f << "%\r";
+    }
+    cout << setw (5) << "100%\n";
+
+    frames.push_back(frame1Color);
+
+    int currentFrame;
+    currentFrame = 0;
+
+    Mat flowFieldForward;
+    Mat flowFieldBackward;
+
+    getFlowField(fu, fv, flowFieldForward);
+    getFlowField(bu, bv, flowFieldBackward);
+
+    imshow("Forward flow", flowFieldForward);
+    imshow("Backward flow", flowFieldBackward);
+
+    imshow("Interpolated frame", frames[currentFrame]);
+
+    bool qPressed = false;
+    while (!qPressed)
+    {
+        int key = toupper(waitKey(10));
+        switch (key)
+        {
+        case 27:
+            qPressed = true;
+            break;
+        case 'A':
+            if (currentFrame > 0) 
+                --currentFrame;
+
+            imshow("Interpolated frame", frames[currentFrame]);
+            break;
+        case 'S':
+            if (currentFrame < frames.size() - 1)
+                ++currentFrame;
+
+            imshow("Interpolated frame", frames[currentFrame]);
+            break;
+        }
+    }
+
+    return 0;
+}
+
+#endif