added wrappers for BroxOpticalFlow and interpolateFrames

2011-10-17 13:12:39 +00:00
parent 87f3451ec6
commit b0536279eb
6 changed files with 986 additions and 216 deletions
--- a/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu
+++ b/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu
@@ -65,76 +65,6 @@
 #include "opencv2/gpu/device/utility.hpp"


-////////////////////////////////////////////
-template<typename _Tp> class shared_ptr
-{
-public:
-    shared_ptr()  : obj(0), refcount(0) {}
-    shared_ptr(_Tp* _obj);
-    ~shared_ptr() { release(); }
-    shared_ptr(const shared_ptr& ptr);
-    shared_ptr& operator = (const shared_ptr& ptr);
-    void addref() { if( refcount ) (*refcount)+=1; }
-    void release();
-    void delete_obj() { if( obj ) delete obj; }
-    _Tp* operator -> () { return obj; }
-    const _Tp* operator -> () const { return obj; }
-    operator _Tp* () { return obj; }
-    operator const _Tp*() const { return obj; }
-protected:
-    _Tp* obj; //< the object pointer.
-    int* refcount; //< the associated reference counter
-};
-
-template<typename _Tp> inline shared_ptr<_Tp>::shared_ptr(_Tp* _obj) : obj(_obj)
-{
-    if(obj)
-    {
-        refcount = new int;
-        *refcount = 1;
-    }
-    else
-        refcount = 0;
-}
-
-template<typename _Tp> inline void shared_ptr<_Tp>::release()
-{
-    if( refcount)
-    {
-        *refcount -= 1;
-        if (*refcount == 0)
-        {
-            delete_obj();
-            delete refcount;
-        }
-    }
-    refcount = 0;
-    obj = 0;
-}
-
-template<typename _Tp> inline shared_ptr<_Tp>::shared_ptr(const shared_ptr<_Tp>& ptr)
-{
-    obj = ptr.obj;
-    refcount = ptr.refcount;
-    addref();
-}
-
-template<typename _Tp> inline shared_ptr<_Tp>& shared_ptr<_Tp>::operator = (const shared_ptr<_Tp>& ptr)
-{
-    int* _refcount = ptr.refcount;
-    if( _refcount )
-        *_refcount += 1;
-        
-    release();
-    obj = ptr.obj;
-    refcount = _refcount;
-    return *this;
-}
-
-
-////////////////////////////////////////////
-//using std::tr1::shared_ptr;
-
 typedef NCVVectorAlloc<Ncv32f> FloatVector;

 /////////////////////////////////////////////////////////////////////////////////////////
@@ -738,6 +668,42 @@ void InitTextures()
    initTexture1D(tex_numerator_v);
 }

+namespace
+{
+    struct ImagePyramid
+    {
+        std::vector<FloatVector*> img0;
+        std::vector<FloatVector*> img1;
+
+        std::vector<Ncv32u> w;
+        std::vector<Ncv32u> h;
+
+        explicit ImagePyramid(int outer_iterations)
+        {
+            img0.reserve(outer_iterations);
+            img1.reserve(outer_iterations);
+
+            w.reserve(outer_iterations);
+            h.reserve(outer_iterations);
+        }
+
+        ~ImagePyramid()
+        {
+            w.clear();
+            h.clear();
+
+            for (int i = img0.size() - 1; i >= 0; --i)
+            {
+                delete img1[i];
+                delete img0[i];
+            }
+
+            img0.clear();
+            img1.clear();
+        }
+    };
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////
 // MAIN FUNCTION
 /////////////////////////////////////////////////////////////////////////////////////////
@@ -759,21 +725,19 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
    const Ncv32u kSourceHeight = frame0.height();

    ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT);
-    ncvAssertReturn(uOut.width() == kSourceWidth   && vOut.width() == kSourceWidth && 
+    ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth && 
        uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT);

    ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    bool kSkipProcessing = gpu_mem_allocator.isCounting();

-    cudaDeviceProp device_props;
    int cuda_device;
-
    ncvAssertCUDAReturn(cudaGetDevice(&cuda_device), NCV_CUDA_ERROR);

+    cudaDeviceProp device_props;
    ncvAssertCUDAReturn(cudaGetDeviceProperties(&device_props, cuda_device), NCV_CUDA_ERROR);

-
    Ncv32u alignmentValue = gpu_mem_allocator.alignment ();

    const Ncv32u kStrideAlignmentFloat = alignmentValue / sizeof(float);
@@ -817,8 +781,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,

    // temporary storage
    SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator, 
-        alignUp(kSourceWidth, kStrideAlignmentFloat) 
-        * alignUp(kSourceHeight, kStrideAlignmentFloat));
+        alignUp(kSourceWidth, kStrideAlignmentFloat) * alignUp(kSourceHeight, kStrideAlignmentFloat));

    // image derivatives
    SAFE_VECTOR_DECL(Ix,  gpu_mem_allocator, kSizeInPixelsAligned);
@@ -831,35 +794,31 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,

    // spatial derivative filter size
    const int kDFilterSize = 5;
-    const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};
    SAFE_VECTOR_DECL(derivativeFilter, gpu_mem_allocator, kDFilterSize);

-    ncvAssertCUDAReturn(
-        cudaMemcpy(derivativeFilter.ptr(), 
-                   derivativeFilterHost, 
-                   sizeof(float) * kDFilterSize, 
-                   cudaMemcpyHostToDevice),
-        NCV_CUDA_ERROR);
+    if (!kSkipProcessing)
+    {
+        const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};

-    InitTextures();
+        ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize, 
+            cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
+
+        InitTextures();
+    }

    //prepare image pyramid
-    std::vector< shared_ptr<FloatVector> > img0_pyramid;
-    std::vector< shared_ptr<FloatVector> > img1_pyramid;
-
-    std::vector<Ncv32u> w_pyramid;
-    std::vector<Ncv32u> h_pyramid;
+    ImagePyramid pyr(desc.number_of_outer_iterations);

    cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();

    float scale = 1.0f;

    //cuda arrays for frames
-    shared_ptr<FloatVector> I0(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
-    ncvAssertReturn(I0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
+    std::auto_ptr<FloatVector> pI0(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
+    ncvAssertReturn(pI0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

-    shared_ptr<FloatVector> I1(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
-    ncvAssertReturn(I1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
+    std::auto_ptr<FloatVector> pI1(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
+    ncvAssertReturn(pI1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    if (!kSkipProcessing)
    {
@@ -867,25 +826,29 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
        size_t dst_width_in_bytes = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float);
        size_t src_width_in_bytes = kSourceWidth * sizeof(float);
        size_t src_pitch_in_bytes = frame0.pitch();
-        ncvAssertCUDAReturn( cudaMemcpy2DAsync(I0->ptr(), dst_width_in_bytes, frame0.ptr(), 
+
+        ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(), 
            src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );

-        ncvAssertCUDAReturn( cudaMemcpy2DAsync(I1->ptr(), dst_width_in_bytes, frame1.ptr(), 
+        ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(), 
            src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
    }

-        //prepare pyramid
-    img0_pyramid.push_back(I0);
-    img1_pyramid.push_back(I1);
+    FloatVector* I0 = pI0.release();
+    FloatVector* I1 = pI1.release();

-    w_pyramid.push_back(kSourceWidth);
-    h_pyramid.push_back(kSourceHeight);
+        //prepare pyramid
+    pyr.img0.push_back(I0);
+    pyr.img1.push_back(I1);
+
+    pyr.w.push_back(kSourceWidth);
+    pyr.h.push_back(kSourceHeight);

    scale *= scale_factor;

    Ncv32u prev_level_width  = kSourceWidth;
    Ncv32u prev_level_height = kSourceHeight;
-    while((prev_level_width > 15) && (prev_level_height > 15) && (static_cast<Ncv32u>(img0_pyramid.size()) < desc.number_of_outer_iterations))
+    while((prev_level_width > 15) && (prev_level_height > 15) && (static_cast<Ncv32u>(pyr.img0.size()) < desc.number_of_outer_iterations))
    {
        //current resolution
        Ncv32u level_width  = static_cast<Ncv32u>(ceilf(kSourceWidth  * scale));
@@ -897,16 +860,16 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,

        Ncv32u prev_level_pitch = alignUp(prev_level_width, kStrideAlignmentFloat) * sizeof(float);

-        shared_ptr<FloatVector> level_frame0(new FloatVector(gpu_mem_allocator, buffer_size));
+        std::auto_ptr<FloatVector> level_frame0(new FloatVector(gpu_mem_allocator, buffer_size));
        ncvAssertReturn(level_frame0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

-        shared_ptr<FloatVector> level_frame1(new FloatVector(gpu_mem_allocator, buffer_size));
+        std::auto_ptr<FloatVector> level_frame1(new FloatVector(gpu_mem_allocator, buffer_size));
        ncvAssertReturn(level_frame1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

-        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
-
        if (!kSkipProcessing)
        {
+            ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
+
            NcvSize32u srcSize (prev_level_width, prev_level_height);
            NcvSize32u dstSize (level_width, level_height);
            NcvRect32u srcROI (0, 0, prev_level_width, prev_level_height);
@@ -921,20 +884,20 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
                level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample);
        }

-        //store pointers
-        img0_pyramid.push_back(level_frame0);
-        img1_pyramid.push_back(level_frame1);
+        I0 = level_frame0.release();
+        I1 = level_frame1.release();

-        w_pyramid.push_back(level_width);
-        h_pyramid.push_back(level_height);
+        //store pointers
+        pyr.img0.push_back(I0);
+        pyr.img1.push_back(I1);
+
+        pyr.w.push_back(level_width);
+        pyr.h.push_back(level_height);

        scale *= scale_factor;

        prev_level_width  = level_width;
        prev_level_height = level_height;
-
-        I0 = level_frame0;
-        I1 = level_frame1;
    }

    if (!kSkipProcessing)
@@ -944,62 +907,56 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
        ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);

        //select images with lowest resolution
-        size_t pitch = alignUp(w_pyramid.back(), kStrideAlignmentFloat) * sizeof(float);
-        ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, img0_pyramid.back()->ptr(), channel_desc, w_pyramid.back(), h_pyramid.back(), pitch), NCV_CUDA_ERROR);
-        ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, img1_pyramid.back()->ptr(), channel_desc, w_pyramid.back(), h_pyramid.back(), pitch), NCV_CUDA_ERROR);
+        size_t pitch = alignUp(pyr.w.back(), kStrideAlignmentFloat) * sizeof(float);
+        ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, pyr.img0.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
+        ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, pyr.img1.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
-    }

-    FloatVector* ptrU = &u;
-    FloatVector* ptrV = &v;
-    FloatVector* ptrUNew = &u_new;
-    FloatVector* ptrVNew = &v_new;
+        FloatVector* ptrU = &u;
+        FloatVector* ptrV = &v;
+        FloatVector* ptrUNew = &u_new;
+        FloatVector* ptrVNew = &v_new;

-    std::vector< shared_ptr<FloatVector> >::const_reverse_iterator img0Iter = img0_pyramid.rbegin();
-    std::vector< shared_ptr<FloatVector> >::const_reverse_iterator img1Iter = img1_pyramid.rbegin();
-    //outer loop
-    //warping fixed point iteration
-    while(!w_pyramid.empty())
-    {
-        //current grid dimensions
-        const Ncv32u kLevelWidth  = w_pyramid.back();
-        const Ncv32u kLevelHeight = h_pyramid.back();
-        const Ncv32u kLevelStride = alignUp(kLevelWidth, kStrideAlignmentFloat);
+        std::vector<FloatVector*>::const_reverse_iterator img0Iter = pyr.img0.rbegin();
+        std::vector<FloatVector*>::const_reverse_iterator img1Iter = pyr.img1.rbegin();

-        //size of current image in bytes
-        const int kLevelSizeInBytes = kLevelStride * kLevelHeight * sizeof(float);
-
-        //number of points at current resolution
-        const int kLevelSizeInPixels = kLevelStride * kLevelHeight;
-
-        if (!kSkipProcessing)
+        //outer loop
+        //warping fixed point iteration
+        while(!pyr.w.empty())
        {
+            //current grid dimensions
+            const Ncv32u kLevelWidth  = pyr.w.back();
+            const Ncv32u kLevelHeight = pyr.h.back();
+            const Ncv32u kLevelStride = alignUp(kLevelWidth, kStrideAlignmentFloat);
+
+            //size of current image in bytes
+            const int kLevelSizeInBytes = kLevelStride * kLevelHeight * sizeof(float);
+
+            //number of points at current resolution
+            const int kLevelSizeInPixels = kLevelStride * kLevelHeight;
+
            //initial guess for du and dv
            ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
-        }

-        //texture format descriptor
-        cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
+            //texture format descriptor
+            cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();

-        I0 = *img0Iter;
-        I1 = *img1Iter;
+            I0 = *img0Iter;
+            I1 = *img1Iter;

-        ++img0Iter;
-        ++img1Iter;
+            ++img0Iter;
+            ++img1Iter;

-        if (!kSkipProcessing)
-        {
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), channel_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), channel_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
-        }
-        //compute derivatives
-        dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
-        dim3 dThreads(32, 6);

-        const int kPitchTex = kLevelStride * sizeof(float);
-        if (!kSkipProcessing)
-        {
+            //compute derivatives
+            dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
+            dim3 dThreads(32, 6);
+
+            const int kPitchTex = kLevelStride * sizeof(float);
+        
            NcvSize32u srcSize(kLevelWidth, kLevelHeight);
            Ncv32u nSrcStep = kLevelStride * sizeof(float);
            NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight);
@@ -1031,10 +988,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
            // Ixy
            nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f); 
-        }
-
-        if (!kSkipProcessing)
-        {
+      
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix,  Ix.ptr(),  channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
@@ -1049,23 +1003,19 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
            //    flow increments
            ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-        }

+            dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
+            dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);

-        dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
-        dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);
+            dim3 sor_blocks(iDivUp(kLevelWidth, SOR_TILE_WIDTH), iDivUp(kLevelHeight, SOR_TILE_HEIGHT));
+            dim3 sor_threads(SOR_TILE_WIDTH, SOR_TILE_HEIGHT);

-        dim3 sor_blocks(iDivUp(kLevelWidth, SOR_TILE_WIDTH), iDivUp(kLevelHeight, SOR_TILE_HEIGHT));
-        dim3 sor_threads(SOR_TILE_WIDTH, SOR_TILE_HEIGHT);
-
-        // inner loop
-        // lagged nonlinearity fixed point iteration
-        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
-        for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
-        {
-            //compute coefficients
-            if (!kSkipProcessing)
+            // inner loop
+            // lagged nonlinearity fixed point iteration
+            ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
+            for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
            {
+                //compute coefficients
                prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>
                    (diffusivity_x.ptr(), 
                     diffusivity_y.ptr(), 
@@ -1101,13 +1051,12 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,

                ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-            }
-            //solve linear system
-            for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
-            {
-                float omega = 1.99f;
-                if (!kSkipProcessing)
+            
+                //solve linear system
+                for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
                {
+                    float omega = 1.99f;
+                
                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

@@ -1139,33 +1088,29 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
                        kLevelWidth, 
                        kLevelHeight, 
                        kLevelStride);
-                }

-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-            }//end of solver loop
-        }// end of inner loop
+                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+                }//end of solver loop
+            }// end of inner loop

-        //update u and v
-        if (!kSkipProcessing)
-        {
+            //update u and v
            add(ptrU->ptr(), du.ptr(), kLevelSizeInPixels, stream);
            add(ptrV->ptr(), dv.ptr(), kLevelSizeInPixels, stream);
-        }
-        //prolongate using texture
-        w_pyramid.pop_back();
-        h_pyramid.pop_back();
-        if (!w_pyramid.empty())
-        {
-            //compute new image size
-            Ncv32u nw = w_pyramid.back();
-            Ncv32u nh = h_pyramid.back();
-            Ncv32u ns = alignUp(nw, kStrideAlignmentFloat);

-            dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
-            dim3 p_threads(32, 8);
-            if (!kSkipProcessing)
+            //prolongate using texture
+            pyr.w.pop_back();
+            pyr.h.pop_back();
+            if (!pyr.w.empty())
            {
+                //compute new image size
+                Ncv32u nw = pyr.w.back();
+                Ncv32u nh = pyr.h.back();
+                Ncv32u ns = alignUp(nw, kStrideAlignmentFloat);
+
+                dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
+                dim3 p_threads(32, 8);
+            
                NcvSize32u srcSize (kLevelWidth, kLevelHeight);
                NcvSize32u dstSize (nw, nh);
                NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
@@ -1180,27 +1125,27 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
                    ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic);
 					
 				ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
+
+                cv::gpu::device::swap<FloatVector*>(ptrU, ptrUNew);
+                cv::gpu::device::swap<FloatVector*>(ptrV, ptrVNew);
            }
-
-            cv::gpu::device::swap<FloatVector*>(ptrU, ptrUNew);
-            cv::gpu::device::swap<FloatVector*>(ptrV, ptrVNew);
+            scale /= scale_factor;
        }
-        scale /= scale_factor;
+
+        // end of warping iterations
+        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
+
+        ncvAssertCUDAReturn( cudaMemcpy2DAsync
+            (uOut.ptr(), uOut.pitch(), ptrU->ptr(), 
+            kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
+
+        ncvAssertCUDAReturn( cudaMemcpy2DAsync
+            (vOut.ptr(), vOut.pitch(), ptrV->ptr(), 
+            kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
+
+        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
+        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
    }

-    // end of warping iterations
-    ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
-
-    ncvAssertCUDAReturn( cudaMemcpy2DAsync
-        (uOut.ptr(), uOut.pitch(), ptrU->ptr(), 
-        kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
-
-    ncvAssertCUDAReturn( cudaMemcpy2DAsync
-        (vOut.ptr(), vOut.pitch(), ptrV->ptr(), 
-        kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
-
-    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
-    ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
-
    return NCV_SUCCESS;
 }
--- a/modules/gpu/src/optical_flow.cpp
+++ b/modules/gpu/src/optical_flow.cpp
@@ -0,0 +1,198 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other GpuMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or bpied warranties, including, but not limited to, the bpied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+using namespace std;
+
+#if !defined (HAVE_CUDA)
+
+void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::interpolateFrames(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, float, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+
+#else
+
+namespace
+{
+    size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc, const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
+                      NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
+    {
+        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
+        CV_Assert(gpuCounter.isInitialized());
+
+        NCVStatus ncvStat = NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0);
+        CV_Assert(ncvStat == NCV_SUCCESS);
+
+        return gpuCounter.maxSize();
+    }
+}
+
+namespace
+{
+    void outputHandler(const char* msg) 
+    { 
+        CV_Error(CV_GpuApiCallError, msg); 
+    }
+}
+
+void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s) 
+{
+    ncvSetDebugOutputHandler(outputHandler);
+
+    CV_Assert(frame0.type() == CV_32FC1);
+    CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
+
+    u.create(frame0.size(), CV_32FC1);
+    v.create(frame0.size(), CV_32FC1);
+
+    cudaDeviceProp devProp;
+    cudaSafeCall( cudaGetDeviceProperties(&devProp, getDevice()) );
+
+    NCVBroxOpticalFlowDescriptor desc;
+
+    desc.alpha = alpha;
+    desc.gamma = gamma;
+    desc.scale_factor = scale_factor;
+    desc.number_of_inner_iterations = inner_iterations;
+    desc.number_of_outer_iterations = outer_iterations;
+    desc.number_of_solver_iterations = solver_iterations;
+
+    NCVMemSegment frame0MemSeg;
+    frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
+    frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
+    frame0MemSeg.size = frame0.step * frame0.rows;
+
+    NCVMemSegment frame1MemSeg;
+    frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
+    frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
+    frame1MemSeg.size = frame1.step * frame1.rows;
+
+    NCVMemSegment uMemSeg;
+    uMemSeg.begin.memtype = NCVMemoryTypeDevice;
+    uMemSeg.begin.ptr = u.ptr();
+    uMemSeg.size = u.step * u.rows;
+
+    NCVMemSegment vMemSeg;
+    vMemSeg.begin.memtype = NCVMemoryTypeDevice;
+    vMemSeg.begin.ptr = v.ptr();
+    vMemSeg.size = v.step * v.rows;
+
+    NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, devProp.textureAlignment, frame0.cols, frame0.rows, frame0.step);
+    NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, devProp.textureAlignment, frame1.cols, frame1.rows, frame1.step);
+    NCVMatrixReuse<Ncv32f> uMat(uMemSeg, devProp.textureAlignment, u.cols, u.rows, u.step);
+    NCVMatrixReuse<Ncv32f> vMat(vMemSeg, devProp.textureAlignment, v.cols, v.rows, v.step);
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, devProp);
+
+    ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
+
+    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
+    CV_Assert(gpuAllocator.isInitialized());
+    
+    NCVStatus ncvStat = NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream);
+    CV_Assert(ncvStat == NCV_SUCCESS);
+}
+
+void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, const GpuMat& fu, const GpuMat& fv, const GpuMat& bu, const GpuMat& bv, 
+                                float pos, GpuMat& newFrame, GpuMat& buf, Stream& s)
+{
+    CV_Assert(frame0.type() == CV_32FC1);
+    CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
+    CV_Assert(fu.size() == frame0.size() && fu.type() == frame0.type());
+    CV_Assert(fv.size() == frame0.size() && fv.type() == frame0.type());
+    CV_Assert(bu.size() == frame0.size() && bu.type() == frame0.type());
+    CV_Assert(bv.size() == frame0.size() && bv.type() == frame0.type());
+
+    newFrame.create(frame0.size(), frame0.type());
+
+    buf.create(6 * frame0.rows, frame0.cols, CV_32FC1);
+    buf.setTo(Scalar::all(0));
+    
+    // occlusion masks
+    GpuMat occ0 = buf.rowRange(0 * frame0.rows, 1 * frame0.rows);
+    GpuMat occ1 = buf.rowRange(1 * frame0.rows, 2 * frame0.rows);
+
+    // interpolated forward flow
+    GpuMat fui = buf.rowRange(2 * frame0.rows, 3 * frame0.rows);
+    GpuMat fvi = buf.rowRange(3 * frame0.rows, 4 * frame0.rows);
+
+    // interpolated backward flow
+    GpuMat bui = buf.rowRange(4 * frame0.rows, 5 * frame0.rows);
+    GpuMat bvi = buf.rowRange(5 * frame0.rows, 6 * frame0.rows);
+
+    size_t step = frame0.step;
+
+    CV_Assert(frame1.step == step && fu.step == step && fv.step == step && bu.step == step && bv.step == step && newFrame.step == step && buf.step == step);
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+    NppStStreamHandler h(stream);
+
+    NppStInterpolationState state;
+
+    state.size         = NcvSize32u(frame0.cols, frame0.rows);
+    state.nStep        = static_cast<Ncv32u>(step);
+    state.pSrcFrame0   = const_cast<Ncv32f*>(frame0.ptr<Ncv32f>());
+    state.pSrcFrame1   = const_cast<Ncv32f*>(frame1.ptr<Ncv32f>());
+    state.pFU          = const_cast<Ncv32f*>(fu.ptr<Ncv32f>());
+    state.pFV          = const_cast<Ncv32f*>(fv.ptr<Ncv32f>());
+    state.pBU          = const_cast<Ncv32f*>(bu.ptr<Ncv32f>());
+    state.pBV          = const_cast<Ncv32f*>(bv.ptr<Ncv32f>());
+    state.pos          = pos;
+    state.pNewFrame    = newFrame.ptr<Ncv32f>();
+    state.ppBuffers[0] = occ0.ptr<Ncv32f>();
+    state.ppBuffers[1] = occ1.ptr<Ncv32f>();
+    state.ppBuffers[2] = fui.ptr<Ncv32f>();
+    state.ppBuffers[3] = fvi.ptr<Ncv32f>();
+    state.ppBuffers[4] = bui.ptr<Ncv32f>();
+    state.ppBuffers[5] = bvi.ptr<Ncv32f>();
+
+    nppSafeCall( nppiStInterpolateFrames(&state) );
+
+    if (stream == 0)
+        cudaSafeCall( cudaDeviceSynchronize() );
+}
+
+#endif /* HAVE_CUDA */
--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -76,6 +76,7 @@
    #include "nvidia/core/NCV.hpp"
    #include "nvidia/NPP_staging/NPP_staging.hpp"
    #include "nvidia/NCVHaarObjectDetection.hpp"
+    #include "nvidia/NCVBroxOpticalFlow.hpp"

 #define CUDART_MINIMUM_REQUIRED_VERSION 4000
 #define NPP_MINIMUM_REQUIRED_VERSION 4000