gpuvideo module for video processing
@@ -1,168 +0,0 @@
/* OpenCV License Agreement header ("For Open Source Computer Vision Library",
   Copyright (C) 2000-2008 Intel Corporation, Copyright (C) 2009 Willow Garage Inc.,
   BSD-style redistribution terms and warranty disclaimer). The same banner appears
   at the top of every file in this commit and is shown only once here. */

#include "precomp.hpp"

#if !defined HAVE_CUDA || defined(CUDA_DISABLER)

cv::gpu::GMG_GPU::GMG_GPU() { throw_no_cuda(); }
void cv::gpu::GMG_GPU::initialize(cv::Size, float, float) { throw_no_cuda(); }
void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, float, cv::gpu::Stream&) { throw_no_cuda(); }
void cv::gpu::GMG_GPU::release() {}

#else

namespace cv { namespace gpu { namespace cudev {
    namespace bgfg_gmg
    {
        void loadConstants(int width, int height, float minVal, float maxVal, int quantizationLevels, float backgroundPrior,
                           float decisionThreshold, int maxFeatures, int numInitializationFrames);

        template <typename SrcT>
        void update_gpu(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
                        int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
    }
}}}

cv::gpu::GMG_GPU::GMG_GPU()
{
    maxFeatures = 64;
    learningRate = 0.025f;
    numInitializationFrames = 120;
    quantizationLevels = 16;
    backgroundPrior = 0.8f;
    decisionThreshold = 0.8f;
    smoothingRadius = 7;
    updateBackgroundModel = true;
}

void cv::gpu::GMG_GPU::initialize(cv::Size frameSize, float min, float max)
{
    using namespace cv::gpu::cudev::bgfg_gmg;

    CV_Assert(min < max);
    CV_Assert(maxFeatures > 0);
    CV_Assert(learningRate >= 0.0f && learningRate <= 1.0f);
    CV_Assert(numInitializationFrames >= 1);
    CV_Assert(quantizationLevels >= 1 && quantizationLevels <= 255);
    CV_Assert(backgroundPrior >= 0.0f && backgroundPrior <= 1.0f);

    minVal_ = min;
    maxVal_ = max;

    frameSize_ = frameSize;
    frameNum_ = 0;

    nfeatures_.create(frameSize_, CV_32SC1);
    colors_.create(maxFeatures * frameSize_.height, frameSize_.width, CV_32SC1);
    weights_.create(maxFeatures * frameSize_.height, frameSize_.width, CV_32FC1);

    nfeatures_.setTo(cv::Scalar::all(0));

    if (smoothingRadius > 0)
        boxFilter_ = cv::gpu::createBoxFilter_GPU(CV_8UC1, CV_8UC1, cv::Size(smoothingRadius, smoothingRadius));

    loadConstants(frameSize_.width, frameSize_.height, minVal_, maxVal_, quantizationLevels, backgroundPrior, decisionThreshold, maxFeatures, numInitializationFrames);
}

void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat& fgmask, float newLearningRate, cv::gpu::Stream& stream)
{
    using namespace cv::gpu::cudev::bgfg_gmg;

    typedef void (*func_t)(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
                           int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
    static const func_t funcs[6][4] =
    {
        {update_gpu<uchar>, 0, update_gpu<uchar3>, update_gpu<uchar4>},
        {0,0,0,0},
        {update_gpu<ushort>, 0, update_gpu<ushort3>, update_gpu<ushort4>},
        {0,0,0,0},
        {0,0,0,0},
        {update_gpu<float>, 0, update_gpu<float3>, update_gpu<float4>}
    };

    CV_Assert(frame.depth() == CV_8U || frame.depth() == CV_16U || frame.depth() == CV_32F);
    CV_Assert(frame.channels() == 1 || frame.channels() == 3 || frame.channels() == 4);

    if (newLearningRate != -1.0f)
    {
        CV_Assert(newLearningRate >= 0.0f && newLearningRate <= 1.0f);
        learningRate = newLearningRate;
    }

    if (frame.size() != frameSize_)
        initialize(frame.size(), 0.0f, frame.depth() == CV_8U ? 255.0f : frame.depth() == CV_16U ? std::numeric_limits<ushort>::max() : 1.0f);

    fgmask.create(frameSize_, CV_8UC1);
    if (stream)
        stream.enqueueMemSet(fgmask, cv::Scalar::all(0));
    else
        fgmask.setTo(cv::Scalar::all(0));

    funcs[frame.depth()][frame.channels() - 1](frame, fgmask, colors_, weights_, nfeatures_, frameNum_, learningRate, updateBackgroundModel, cv::gpu::StreamAccessor::getStream(stream));

    // smooth the raw mask: a box filter followed by a threshold approximates a median filter on the binary mask
    if (smoothingRadius > 0)
    {
        boxFilter_->apply(fgmask, buf_, cv::Rect(0,0,-1,-1), stream);
        int minCount = (smoothingRadius * smoothingRadius + 1) / 2;
        double thresh = 255.0 * minCount / (smoothingRadius * smoothingRadius);
        cv::gpu::threshold(buf_, fgmask, thresh, 255.0, cv::THRESH_BINARY, stream);
    }

    // keep track of how many frames we have processed
    ++frameNum_;
}

void cv::gpu::GMG_GPU::release()
{
    frameSize_ = Size();

    nfeatures_.release();
    colors_.release();
    weights_.release();
    boxFilter_.release();
    buf_.release();
}

#endif
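For context, the GMG_GPU class removed above follows the usual gpu-module background-subtraction pattern: construct, optionally tune the public fields, then call operator() once per frame. A minimal, hedged usage sketch; the include path and the input file name are assumptions, and the stream argument is passed explicitly so no header default is relied on:

    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>   // assumed header location for GMG_GPU in this branch

    int main()
    {
        cv::VideoCapture cap("input.avi");   // hypothetical input video
        cv::gpu::GMG_GPU gmg;                // defaults: 64 features, 120 init frames, smoothingRadius 7
        cv::gpu::GpuMat d_frame, d_fgmask;

        cv::Mat frame, fgmask;
        while (cap.read(frame))
        {
            d_frame.upload(frame);
            // -1 keeps the current learningRate (0.025 by default)
            gmg(d_frame, d_fgmask, -1.0f, cv::gpu::Stream::Null());

            d_fgmask.download(fgmask);       // 255 = foreground, 0 = background
            cv::imshow("fg", fgmask);
            if (cv::waitKey(1) == 27) break;
        }
        return 0;
    }

Note that the first numInitializationFrames calls only train the per-pixel colour histograms, so the mask stays empty until the model has been initialized.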
@@ -1,279 +0,0 @@

#include "precomp.hpp"

#if !defined HAVE_CUDA || defined(CUDA_DISABLER)

cv::gpu::MOG_GPU::MOG_GPU(int) { throw_no_cuda(); }
void cv::gpu::MOG_GPU::initialize(cv::Size, int) { throw_no_cuda(); }
void cv::gpu::MOG_GPU::operator()(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, float, Stream&) { throw_no_cuda(); }
void cv::gpu::MOG_GPU::getBackgroundImage(GpuMat&, Stream&) const { throw_no_cuda(); }
void cv::gpu::MOG_GPU::release() {}

cv::gpu::MOG2_GPU::MOG2_GPU(int) { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::initialize(cv::Size, int) { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::operator()(const GpuMat&, GpuMat&, float, Stream&) { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::getBackgroundImage(GpuMat&, Stream&) const { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::release() {}

#else

namespace cv { namespace gpu { namespace cudev
{
    namespace mog
    {
        void mog_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var,
                     int nmixtures, float varThreshold, float learningRate, float backgroundRatio, float noiseSigma,
                     cudaStream_t stream);
        void getBackgroundImage_gpu(int cn, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream);

        void loadConstants(int nmixtures, float Tb, float TB, float Tg, float varInit, float varMin, float varMax, float tau, unsigned char shadowVal);
        void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);
        void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream);
    }
}}}

namespace mog
{
    const int defaultNMixtures = 5;
    const int defaultHistory = 200;
    const float defaultBackgroundRatio = 0.7f;
    const float defaultVarThreshold = 2.5f * 2.5f;
    const float defaultNoiseSigma = 30.0f * 0.5f;
    const float defaultInitialWeight = 0.05f;
}

cv::gpu::MOG_GPU::MOG_GPU(int nmixtures) :
    frameSize_(0, 0), frameType_(0), nframes_(0)
{
    nmixtures_ = std::min(nmixtures > 0 ? nmixtures : mog::defaultNMixtures, 8);
    history = mog::defaultHistory;
    varThreshold = mog::defaultVarThreshold;
    backgroundRatio = mog::defaultBackgroundRatio;
    noiseSigma = mog::defaultNoiseSigma;
}

void cv::gpu::MOG_GPU::initialize(cv::Size frameSize, int frameType)
{
    CV_Assert(frameType == CV_8UC1 || frameType == CV_8UC3 || frameType == CV_8UC4);

    frameSize_ = frameSize;
    frameType_ = frameType;

    int ch = CV_MAT_CN(frameType);
    int work_ch = ch;

    // for each gaussian mixture of each pixel bg model we store
    // the mixture sort key (w/sum_of_variances), the mixture weight (w),
    // the mean (nchannels values) and
    // the diagonal covariance matrix (another nchannels values)

    weight_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
    sortKey_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
    mean_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC(work_ch));
    var_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC(work_ch));

    weight_.setTo(cv::Scalar::all(0));
    sortKey_.setTo(cv::Scalar::all(0));
    mean_.setTo(cv::Scalar::all(0));
    var_.setTo(cv::Scalar::all(0));

    nframes_ = 0;
}

void cv::gpu::MOG_GPU::operator()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat& fgmask, float learningRate, Stream& stream)
{
    using namespace cv::gpu::cudev::mog;

    CV_Assert(frame.depth() == CV_8U);

    int ch = frame.channels();
    int work_ch = ch;

    if (nframes_ == 0 || learningRate >= 1.0 || frame.size() != frameSize_ || work_ch != mean_.channels())
        initialize(frame.size(), frame.type());

    fgmask.create(frameSize_, CV_8UC1);

    ++nframes_;
    learningRate = learningRate >= 0.0f && nframes_ > 1 ? learningRate : 1.0f / std::min(nframes_, history);
    CV_Assert(learningRate >= 0.0f);

    mog_gpu(frame, ch, fgmask, weight_, sortKey_, mean_, var_, nmixtures_,
            varThreshold, learningRate, backgroundRatio, noiseSigma,
            StreamAccessor::getStream(stream));
}

void cv::gpu::MOG_GPU::getBackgroundImage(GpuMat& backgroundImage, Stream& stream) const
{
    using namespace cv::gpu::cudev::mog;

    backgroundImage.create(frameSize_, frameType_);

    getBackgroundImage_gpu(backgroundImage.channels(), weight_, mean_, backgroundImage, nmixtures_, backgroundRatio, StreamAccessor::getStream(stream));
}

void cv::gpu::MOG_GPU::release()
{
    frameSize_ = Size(0, 0);
    frameType_ = 0;
    nframes_ = 0;

    weight_.release();
    sortKey_.release();
    mean_.release();
    var_.release();
}

/////////////////////////////////////////////////////////////////
// MOG2

namespace mog2
{
    // default parameters of gaussian background detection algorithm
    const int defaultHistory = 500; // learning rate alpha = 1/defaultHistory
    const float defaultVarThreshold = 4.0f * 4.0f;
    const int defaultNMixtures = 5; // maximal number of Gaussians in mixture
    const float defaultBackgroundRatio = 0.9f; // threshold sum of weights for background test
    const float defaultVarThresholdGen = 3.0f * 3.0f;
    const float defaultVarInit = 15.0f; // initial variance for new components
    const float defaultVarMax = 5.0f * defaultVarInit;
    const float defaultVarMin = 4.0f;

    // additional parameters
    const float defaultfCT = 0.05f; // complexity reduction prior constant; 0 means no reduction of the number of components
    const unsigned char defaultnShadowDetection = 127; // value to use in the segmentation mask for shadows; set 0 not to do shadow detection
    const float defaultfTau = 0.5f; // Tau - shadow threshold, see the paper for explanation
}

cv::gpu::MOG2_GPU::MOG2_GPU(int nmixtures) :
    frameSize_(0, 0), frameType_(0), nframes_(0)
{
    nmixtures_ = nmixtures > 0 ? nmixtures : mog2::defaultNMixtures;

    history = mog2::defaultHistory;
    varThreshold = mog2::defaultVarThreshold;
    bShadowDetection = true;

    backgroundRatio = mog2::defaultBackgroundRatio;
    fVarInit = mog2::defaultVarInit;
    fVarMax = mog2::defaultVarMax;
    fVarMin = mog2::defaultVarMin;

    varThresholdGen = mog2::defaultVarThresholdGen;
    fCT = mog2::defaultfCT;
    nShadowDetection = mog2::defaultnShadowDetection;
    fTau = mog2::defaultfTau;
}

void cv::gpu::MOG2_GPU::initialize(cv::Size frameSize, int frameType)
{
    using namespace cv::gpu::cudev::mog;

    CV_Assert(frameType == CV_8UC1 || frameType == CV_8UC3 || frameType == CV_8UC4);

    frameSize_ = frameSize;
    frameType_ = frameType;
    nframes_ = 0;

    int ch = CV_MAT_CN(frameType);
    int work_ch = ch;

    // for each gaussian mixture of each pixel bg model we store
    // the mixture weight (w),
    // the mean (nchannels values) and
    // the covariance
    weight_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
    variance_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
    mean_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC(work_ch));

    // make the array for keeping track of the used modes per pixel - all zeros at start
    bgmodelUsedModes_.create(frameSize_, CV_8UC1);
    bgmodelUsedModes_.setTo(cv::Scalar::all(0));

    loadConstants(nmixtures_, varThreshold, backgroundRatio, varThresholdGen, fVarInit, fVarMin, fVarMax, fTau, nShadowDetection);
}

void cv::gpu::MOG2_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, float learningRate, Stream& stream)
{
    using namespace cv::gpu::cudev::mog;

    int ch = frame.channels();
    int work_ch = ch;

    if (nframes_ == 0 || learningRate >= 1.0f || frame.size() != frameSize_ || work_ch != mean_.channels())
        initialize(frame.size(), frame.type());

    fgmask.create(frameSize_, CV_8UC1);
    fgmask.setTo(cv::Scalar::all(0));

    ++nframes_;
    learningRate = learningRate >= 0.0f && nframes_ > 1 ? learningRate : 1.0f / std::min(2 * nframes_, history);
    CV_Assert(learningRate >= 0.0f);

    mog2_gpu(frame, frame.channels(), fgmask, bgmodelUsedModes_, weight_, variance_, mean_, learningRate, -learningRate * fCT, bShadowDetection, StreamAccessor::getStream(stream));
}

void cv::gpu::MOG2_GPU::getBackgroundImage(GpuMat& backgroundImage, Stream& stream) const
{
    using namespace cv::gpu::cudev::mog;

    backgroundImage.create(frameSize_, frameType_);

    getBackgroundImage2_gpu(backgroundImage.channels(), bgmodelUsedModes_, weight_, mean_, backgroundImage, StreamAccessor::getStream(stream));
}

void cv::gpu::MOG2_GPU::release()
{
    frameSize_ = Size(0, 0);
    frameType_ = 0;
    nframes_ = 0;

    weight_.release();
    variance_.release();
    mean_.release();

    bgmodelUsedModes_.release();
}

#endif
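The MOG_GPU and MOG2_GPU wrappers removed above share the calling convention shown in the GMG sketch; MOG2_GPU additionally exposes shadow detection and a background-image query. A short hedged sketch (same assumed header as before, stream passed explicitly):

    cv::gpu::MOG2_GPU mog2;          // 5 mixtures, history 500, shadow detection on by default
    mog2.bShadowDetection = true;    // shadow pixels are written as nShadowDetection (127)

    cv::gpu::GpuMat d_frame, d_fgmask, d_bg;
    for (;;)
    {
        // ... upload the next CV_8UC1/3/4 frame into d_frame ...
        mog2(d_frame, d_fgmask, -1.0f, cv::gpu::Stream::Null());   // negative rate -> 1/min(2*nframes, history)
        mog2.getBackgroundImage(d_bg, cv::gpu::Stream::Null());    // weighted mean of the modes covering backgroundRatio
    }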
@@ -1,258 +0,0 @@

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/limits.hpp"

namespace cv { namespace gpu { namespace cudev {
    namespace bgfg_gmg
    {
        __constant__ int   c_width;
        __constant__ int   c_height;
        __constant__ float c_minVal;
        __constant__ float c_maxVal;
        __constant__ int   c_quantizationLevels;
        __constant__ float c_backgroundPrior;
        __constant__ float c_decisionThreshold;
        __constant__ int   c_maxFeatures;
        __constant__ int   c_numInitializationFrames;

        void loadConstants(int width, int height, float minVal, float maxVal, int quantizationLevels, float backgroundPrior,
                           float decisionThreshold, int maxFeatures, int numInitializationFrames)
        {
            cudaSafeCall( cudaMemcpyToSymbol(c_width, &width, sizeof(width)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_height, &height, sizeof(height)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_minVal, &minVal, sizeof(minVal)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_maxVal, &maxVal, sizeof(maxVal)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_quantizationLevels, &quantizationLevels, sizeof(quantizationLevels)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_backgroundPrior, &backgroundPrior, sizeof(backgroundPrior)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_decisionThreshold, &decisionThreshold, sizeof(decisionThreshold)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_maxFeatures, &maxFeatures, sizeof(maxFeatures)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_numInitializationFrames, &numInitializationFrames, sizeof(numInitializationFrames)) );
        }

        __device__ float findFeature(const int color, const PtrStepi& colors, const PtrStepf& weights, const int x, const int y, const int nfeatures)
        {
            for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
            {
                if (color == colors(fy, x))
                    return weights(fy, x);
            }

            // not in histogram, so return 0.
            return 0.0f;
        }

        __device__ void normalizeHistogram(PtrStepf weights, const int x, const int y, const int nfeatures)
        {
            float total = 0.0f;
            for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
                total += weights(fy, x);

            if (total != 0.0f)
            {
                for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
                    weights(fy, x) /= total;
            }
        }

        __device__ bool insertFeature(const int color, const float weight, PtrStepi colors, PtrStepf weights, const int x, const int y, int& nfeatures)
        {
            for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
            {
                if (color == colors(fy, x))
                {
                    // feature in histogram
                    weights(fy, x) += weight;

                    return false;
                }
            }

            if (nfeatures == c_maxFeatures)
            {
                // discard oldest feature
                int idx = -1;
                float minVal = numeric_limits<float>::max();
                for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
                {
                    const float w = weights(fy, x);
                    if (w < minVal)
                    {
                        minVal = w;
                        idx = fy;
                    }
                }

                colors(idx, x) = color;
                weights(idx, x) = weight;

                return false;
            }

            colors(nfeatures * c_height + y, x) = color;
            weights(nfeatures * c_height + y, x) = weight;

            ++nfeatures;

            return true;
        }

        namespace detail
        {
            template <int cn> struct Quantization
            {
                template <typename T>
                __device__ static int apply(const T& val)
                {
                    int res = 0;
                    res |= static_cast<int>((val.x - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal));
                    res |= static_cast<int>((val.y - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal)) << 8;
                    res |= static_cast<int>((val.z - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal)) << 16;
                    return res;
                }
            };

            template <> struct Quantization<1>
            {
                template <typename T>
                __device__ static int apply(T val)
                {
                    return static_cast<int>((val - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal));
                }
            };
        }

        template <typename T> struct Quantization : detail::Quantization<VecTraits<T>::cn> {};

        template <typename SrcT>
        __global__ void update(const PtrStep<SrcT> frame, PtrStepb fgmask, PtrStepi colors_, PtrStepf weights_, PtrStepi nfeatures_,
                               const int frameNum, const float learningRate, const bool updateBackgroundModel)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= c_width || y >= c_height)
                return;

            const SrcT pix = frame(y, x);
            const int newFeatureColor = Quantization<SrcT>::apply(pix);

            int nfeatures = nfeatures_(y, x);

            if (frameNum >= c_numInitializationFrames)
            {
                // typical operation
                const float weight = findFeature(newFeatureColor, colors_, weights_, x, y, nfeatures);

                // see Godbehere, Matsukawa, Goldberg (2012) for reasoning behind this implementation of Bayes rule
                const float posterior = (weight * c_backgroundPrior) / (weight * c_backgroundPrior + (1.0f - weight) * (1.0f - c_backgroundPrior));

                const bool isForeground = ((1.0f - posterior) > c_decisionThreshold);
                fgmask(y, x) = (uchar)(-isForeground);

                // update histogram
                if (updateBackgroundModel)
                {
                    for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
                        weights_(fy, x) *= 1.0f - learningRate;

                    bool inserted = insertFeature(newFeatureColor, learningRate, colors_, weights_, x, y, nfeatures);

                    if (inserted)
                    {
                        normalizeHistogram(weights_, x, y, nfeatures);
                        nfeatures_(y, x) = nfeatures;
                    }
                }
            }
            else if (updateBackgroundModel)
            {
                // training-mode update
                insertFeature(newFeatureColor, 1.0f, colors_, weights_, x, y, nfeatures);

                if (frameNum == c_numInitializationFrames - 1)
                    normalizeHistogram(weights_, x, y, nfeatures);
            }
        }

        template <typename SrcT>
        void update_gpu(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
                        int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream)
        {
            const dim3 block(32, 8);
            const dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(update<SrcT>, cudaFuncCachePreferL1) );

            update<SrcT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, colors, weights, nfeatures, frameNum, learningRate, updateBackgroundModel);

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void update_gpu<uchar  >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
        template void update_gpu<uchar3 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
        template void update_gpu<uchar4 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);

        template void update_gpu<ushort >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
        template void update_gpu<ushort3>(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
        template void update_gpu<ushort4>(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);

        template void update_gpu<float  >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
        template void update_gpu<float3 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
        template void update_gpu<float4 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
    }
}}}

#endif /* CUDA_DISABLER */
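The per-pixel decision in the update kernel above is a direct application of Bayes' rule: the histogram weight of the quantized colour plays the role of the likelihood of that colour given background, and backgroundPrior is the prior. Restated in LaTeX (this is only a restatement of the "posterior = ..." expression in the kernel, not an addition to the algorithm):

    P(\mathrm{bg}\mid c) \;=\; \frac{w(c)\,p_{\mathrm{bg}}}{\,w(c)\,p_{\mathrm{bg}} + (1 - w(c))(1 - p_{\mathrm{bg}})\,},
    \qquad \text{foreground} \iff 1 - P(\mathrm{bg}\mid c) > \text{decisionThreshold}

where w(c) is the weight returned by findFeature for the quantized colour c and p_bg = c_backgroundPrior. With the default decisionThreshold of 0.8, a pixel is marked foreground only when the model assigns less than 0.2 posterior probability to background.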
@@ -1,764 +0,0 @@
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
#include "opencv2/core/cuda/vec_traits.hpp"
|
||||
#include "opencv2/core/cuda/vec_math.hpp"
|
||||
#include "opencv2/core/cuda/limits.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace cudev
|
||||
{
|
||||
namespace mog
|
||||
{
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Utility
|
||||
|
||||
__device__ __forceinline__ float cvt(uchar val)
|
||||
{
|
||||
return val;
|
||||
}
|
||||
__device__ __forceinline__ float3 cvt(const uchar3& val)
|
||||
{
|
||||
return make_float3(val.x, val.y, val.z);
|
||||
}
|
||||
__device__ __forceinline__ float4 cvt(const uchar4& val)
|
||||
{
|
||||
return make_float4(val.x, val.y, val.z, val.w);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ float sqr(float val)
|
||||
{
|
||||
return val * val;
|
||||
}
|
||||
__device__ __forceinline__ float sqr(const float3& val)
|
||||
{
|
||||
return val.x * val.x + val.y * val.y + val.z * val.z;
|
||||
}
|
||||
__device__ __forceinline__ float sqr(const float4& val)
|
||||
{
|
||||
return val.x * val.x + val.y * val.y + val.z * val.z;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ float sum(float val)
|
||||
{
|
||||
return val;
|
||||
}
|
||||
__device__ __forceinline__ float sum(const float3& val)
|
||||
{
|
||||
return val.x + val.y + val.z;
|
||||
}
|
||||
__device__ __forceinline__ float sum(const float4& val)
|
||||
{
|
||||
return val.x + val.y + val.z;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ float clamp(float var, float learningRate, float diff, float minVar)
|
||||
{
|
||||
return ::fmaxf(var + learningRate * (diff * diff - var), minVar);
|
||||
}
|
||||
__device__ __forceinline__ float3 clamp(const float3& var, float learningRate, const float3& diff, float minVar)
|
||||
{
|
||||
return make_float3(::fmaxf(var.x + learningRate * (diff.x * diff.x - var.x), minVar),
|
||||
::fmaxf(var.y + learningRate * (diff.y * diff.y - var.y), minVar),
|
||||
::fmaxf(var.z + learningRate * (diff.z * diff.z - var.z), minVar));
|
||||
}
|
||||
__device__ __forceinline__ float4 clamp(const float4& var, float learningRate, const float4& diff, float minVar)
|
||||
{
|
||||
return make_float4(::fmaxf(var.x + learningRate * (diff.x * diff.x - var.x), minVar),
|
||||
::fmaxf(var.y + learningRate * (diff.y * diff.y - var.y), minVar),
|
||||
::fmaxf(var.z + learningRate * (diff.z * diff.z - var.z), minVar),
|
||||
0.0f);
|
||||
}
|
||||
|
||||
template <class Ptr2D>
|
||||
__device__ __forceinline__ void swap(Ptr2D& ptr, int x, int y, int k, int rows)
|
||||
{
|
||||
typename Ptr2D::elem_type val = ptr(k * rows + y, x);
|
||||
ptr(k * rows + y, x) = ptr((k + 1) * rows + y, x);
|
||||
ptr((k + 1) * rows + y, x) = val;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// MOG without learning
|
||||
|
||||
template <typename SrcT, typename WorkT>
|
||||
__global__ void mog_withoutLearning(const PtrStepSz<SrcT> frame, PtrStepb fgmask,
|
||||
const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, const PtrStep<WorkT> gmm_var,
|
||||
const int nmixtures, const float varThreshold, const float backgroundRatio)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x >= frame.cols || y >= frame.rows)
|
||||
return;
|
||||
|
||||
WorkT pix = cvt(frame(y, x));
|
||||
|
||||
int kHit = -1;
|
||||
int kForeground = -1;
|
||||
|
||||
for (int k = 0; k < nmixtures; ++k)
|
||||
{
|
||||
if (gmm_weight(k * frame.rows + y, x) < numeric_limits<float>::epsilon())
|
||||
break;
|
||||
|
||||
WorkT mu = gmm_mean(k * frame.rows + y, x);
|
||||
WorkT var = gmm_var(k * frame.rows + y, x);
|
||||
|
||||
WorkT diff = pix - mu;
|
||||
|
||||
if (sqr(diff) < varThreshold * sum(var))
|
||||
{
|
||||
kHit = k;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (kHit >= 0)
|
||||
{
|
||||
float wsum = 0.0f;
|
||||
for (int k = 0; k < nmixtures; ++k)
|
||||
{
|
||||
wsum += gmm_weight(k * frame.rows + y, x);
|
||||
|
||||
if (wsum > backgroundRatio)
|
||||
{
|
||||
kForeground = k + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fgmask(y, x) = (uchar) (-(kHit < 0 || kHit >= kForeground));
|
||||
}
|
||||
|
||||
template <typename SrcT, typename WorkT>
|
||||
void mog_withoutLearning_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb var,
|
||||
int nmixtures, float varThreshold, float backgroundRatio, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
|
||||
|
||||
cudaSafeCall( cudaFuncSetCacheConfig(mog_withoutLearning<SrcT, WorkT>, cudaFuncCachePreferL1) );
|
||||
|
||||
mog_withoutLearning<SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask,
|
||||
weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<WorkT>) var,
|
||||
nmixtures, varThreshold, backgroundRatio);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// MOG with learning
|
||||
|
||||
template <typename SrcT, typename WorkT>
|
||||
__global__ void mog_withLearning(const PtrStepSz<SrcT> frame, PtrStepb fgmask,
|
||||
PtrStepf gmm_weight, PtrStepf gmm_sortKey, PtrStep<WorkT> gmm_mean, PtrStep<WorkT> gmm_var,
|
||||
const int nmixtures, const float varThreshold, const float backgroundRatio, const float learningRate, const float minVar)
|
||||
{
|
||||
const float w0 = 0.05f;
|
||||
const float sk0 = w0 / (30.0f * 0.5f * 2.0f);
|
||||
const float var0 = 30.0f * 0.5f * 30.0f * 0.5f * 4.0f;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x >= frame.cols || y >= frame.rows)
|
||||
return;
|
||||
|
||||
WorkT pix = cvt(frame(y, x));
|
||||
|
||||
float wsum = 0.0f;
|
||||
int kHit = -1;
|
||||
int kForeground = -1;
|
||||
|
||||
int k = 0;
|
||||
for (; k < nmixtures; ++k)
|
||||
{
|
||||
float w = gmm_weight(k * frame.rows + y, x);
|
||||
wsum += w;
|
||||
|
||||
if (w < numeric_limits<float>::epsilon())
|
||||
break;
|
||||
|
||||
WorkT mu = gmm_mean(k * frame.rows + y, x);
|
||||
WorkT var = gmm_var(k * frame.rows + y, x);
|
||||
|
||||
WorkT diff = pix - mu;
|
||||
|
||||
if (sqr(diff) < varThreshold * sum(var))
|
||||
{
|
||||
wsum -= w;
|
||||
float dw = learningRate * (1.0f - w);
|
||||
|
||||
var = clamp(var, learningRate, diff, minVar);
|
||||
|
||||
float sortKey_prev = w / ::sqrtf(sum(var));
|
||||
gmm_sortKey(k * frame.rows + y, x) = sortKey_prev;
|
||||
|
||||
float weight_prev = w + dw;
|
||||
gmm_weight(k * frame.rows + y, x) = weight_prev;
|
||||
|
||||
WorkT mean_prev = mu + learningRate * diff;
|
||||
gmm_mean(k * frame.rows + y, x) = mean_prev;
|
||||
|
||||
WorkT var_prev = var;
|
||||
gmm_var(k * frame.rows + y, x) = var_prev;
|
||||
|
||||
int k1 = k - 1;
|
||||
|
||||
if (k1 >= 0)
|
||||
{
|
||||
float sortKey_next = gmm_sortKey(k1 * frame.rows + y, x);
|
||||
float weight_next = gmm_weight(k1 * frame.rows + y, x);
|
||||
WorkT mean_next = gmm_mean(k1 * frame.rows + y, x);
|
||||
WorkT var_next = gmm_var(k1 * frame.rows + y, x);
|
||||
|
||||
for (; sortKey_next < sortKey_prev && k1 >= 0; --k1)
|
||||
{
|
||||
gmm_sortKey(k1 * frame.rows + y, x) = sortKey_prev;
|
||||
gmm_sortKey((k1 + 1) * frame.rows + y, x) = sortKey_next;
|
||||
|
||||
gmm_weight(k1 * frame.rows + y, x) = weight_prev;
|
||||
gmm_weight((k1 + 1) * frame.rows + y, x) = weight_next;
|
||||
|
||||
gmm_mean(k1 * frame.rows + y, x) = mean_prev;
|
||||
gmm_mean((k1 + 1) * frame.rows + y, x) = mean_next;
|
||||
|
||||
gmm_var(k1 * frame.rows + y, x) = var_prev;
|
||||
gmm_var((k1 + 1) * frame.rows + y, x) = var_next;
|
||||
|
||||
sortKey_prev = sortKey_next;
|
||||
sortKey_next = k1 > 0 ? gmm_sortKey((k1 - 1) * frame.rows + y, x) : 0.0f;
|
||||
|
||||
weight_prev = weight_next;
|
||||
weight_next = k1 > 0 ? gmm_weight((k1 - 1) * frame.rows + y, x) : 0.0f;
|
||||
|
||||
mean_prev = mean_next;
|
||||
mean_next = k1 > 0 ? gmm_mean((k1 - 1) * frame.rows + y, x) : VecTraits<WorkT>::all(0.0f);
|
||||
|
||||
var_prev = var_next;
|
||||
var_next = k1 > 0 ? gmm_var((k1 - 1) * frame.rows + y, x) : VecTraits<WorkT>::all(0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
kHit = k1 + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (kHit < 0)
|
||||
{
|
||||
// no appropriate gaussian mixture found at all, remove the weakest mixture and create a new one
|
||||
kHit = k = ::min(k, nmixtures - 1);
|
||||
wsum += w0 - gmm_weight(k * frame.rows + y, x);
|
||||
|
||||
gmm_weight(k * frame.rows + y, x) = w0;
|
||||
gmm_mean(k * frame.rows + y, x) = pix;
|
||||
gmm_var(k * frame.rows + y, x) = VecTraits<WorkT>::all(var0);
|
||||
gmm_sortKey(k * frame.rows + y, x) = sk0;
|
||||
}
|
||||
else
|
||||
{
|
||||
for( ; k < nmixtures; k++)
|
||||
wsum += gmm_weight(k * frame.rows + y, x);
|
||||
}
|
||||
|
||||
float wscale = 1.0f / wsum;
|
||||
wsum = 0;
|
||||
for (k = 0; k < nmixtures; ++k)
|
||||
{
|
||||
float w = gmm_weight(k * frame.rows + y, x);
|
||||
wsum += w *= wscale;
|
||||
|
||||
gmm_weight(k * frame.rows + y, x) = w;
|
||||
gmm_sortKey(k * frame.rows + y, x) *= wscale;
|
||||
|
||||
if (wsum > backgroundRatio && kForeground < 0)
|
||||
kForeground = k + 1;
|
||||
}
|
||||
|
||||
fgmask(y, x) = (uchar)(-(kHit >= kForeground));
|
||||
}
|
||||
|
||||
template <typename SrcT, typename WorkT>
|
||||
void mog_withLearning_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var,
|
||||
int nmixtures, float varThreshold, float backgroundRatio, float learningRate, float minVar,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
|
||||
|
||||
cudaSafeCall( cudaFuncSetCacheConfig(mog_withLearning<SrcT, WorkT>, cudaFuncCachePreferL1) );
|
||||
|
||||
mog_withLearning<SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask,
|
||||
weight, sortKey, (PtrStepSz<WorkT>) mean, (PtrStepSz<WorkT>) var,
|
||||
nmixtures, varThreshold, backgroundRatio, learningRate, minVar);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// MOG
|
||||
|
||||
void mog_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float learningRate, float backgroundRatio, float noiseSigma, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*withoutLearning_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float backgroundRatio, cudaStream_t stream);
|
||||
typedef void (*withLearning_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float backgroundRatio, float learningRate, float minVar, cudaStream_t stream);
|
||||
|
||||
static const withoutLearning_t withoutLearning[] =
|
||||
{
|
||||
0, mog_withoutLearning_caller<uchar, float>, 0, mog_withoutLearning_caller<uchar3, float3>, mog_withoutLearning_caller<uchar4, float4>
|
||||
};
|
||||
static const withLearning_t withLearning[] =
|
||||
{
|
||||
0, mog_withLearning_caller<uchar, float>, 0, mog_withLearning_caller<uchar3, float3>, mog_withLearning_caller<uchar4, float4>
|
||||
};
|
||||
|
||||
const float minVar = noiseSigma * noiseSigma;
|
||||
|
||||
if (learningRate > 0.0f)
|
||||
withLearning[cn](frame, fgmask, weight, sortKey, mean, var, nmixtures, varThreshold, backgroundRatio, learningRate, minVar, stream);
|
||||
else
|
||||
withoutLearning[cn](frame, fgmask, weight, mean, var, nmixtures, varThreshold, backgroundRatio, stream);
|
||||
}
|
||||
|
||||
template <typename WorkT, typename OutT>
|
||||
__global__ void getBackgroundImage(const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, PtrStepSz<OutT> dst, const int nmixtures, const float backgroundRatio)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x >= dst.cols || y >= dst.rows)
|
||||
return;
|
||||
|
||||
WorkT meanVal = VecTraits<WorkT>::all(0.0f);
|
||||
float totalWeight = 0.0f;
|
||||
|
||||
for (int mode = 0; mode < nmixtures; ++mode)
|
||||
{
|
||||
float weight = gmm_weight(mode * dst.rows + y, x);
|
||||
|
||||
WorkT mean = gmm_mean(mode * dst.rows + y, x);
|
||||
meanVal = meanVal + weight * mean;
|
||||
|
||||
totalWeight += weight;
|
||||
|
||||
if(totalWeight > backgroundRatio)
|
||||
break;
|
||||
}
|
||||
|
||||
meanVal = meanVal * (1.f / totalWeight);
|
||||
|
||||
dst(y, x) = saturate_cast<OutT>(meanVal);
|
||||
}
|
||||
|
||||
template <typename WorkT, typename OutT>
|
||||
void getBackgroundImage_caller(PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
|
||||
cudaSafeCall( cudaFuncSetCacheConfig(getBackgroundImage<WorkT, OutT>, cudaFuncCachePreferL1) );
|
||||
|
||||
getBackgroundImage<WorkT, OutT><<<grid, block, 0, stream>>>(weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<OutT>) dst, nmixtures, backgroundRatio);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
void getBackgroundImage_gpu(int cn, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*func_t)(PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream);
|
||||
|
||||
static const func_t funcs[] =
|
||||
{
|
||||
0, getBackgroundImage_caller<float, uchar>, 0, getBackgroundImage_caller<float3, uchar3>, getBackgroundImage_caller<float4, uchar4>
|
||||
};
|
||||
|
||||
funcs[cn](weight, mean, dst, nmixtures, backgroundRatio, stream);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// MOG2
|
||||
|
||||
__constant__ int c_nmixtures;
|
||||
__constant__ float c_Tb;
|
||||
__constant__ float c_TB;
|
||||
__constant__ float c_Tg;
|
||||
__constant__ float c_varInit;
|
||||
__constant__ float c_varMin;
|
||||
__constant__ float c_varMax;
|
||||
__constant__ float c_tau;
|
||||
__constant__ unsigned char c_shadowVal;
|
||||
|
||||
void loadConstants(int nmixtures, float Tb, float TB, float Tg, float varInit, float varMin, float varMax, float tau, unsigned char shadowVal)
|
||||
{
|
||||
varMin = ::fminf(varMin, varMax);
|
||||
varMax = ::fmaxf(varMin, varMax);
|
||||
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_nmixtures, &nmixtures, sizeof(int)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_Tb, &Tb, sizeof(float)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_TB, &TB, sizeof(float)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_Tg, &Tg, sizeof(float)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_varInit, &varInit, sizeof(float)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_varMin, &varMin, sizeof(float)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_varMax, &varMax, sizeof(float)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_tau, &tau, sizeof(float)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_shadowVal, &shadowVal, sizeof(unsigned char)) );
|
||||
}
|
||||
|
||||
template <bool detectShadows, typename SrcT, typename WorkT>
|
||||
__global__ void mog2(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStepb modesUsed,
|
||||
PtrStepf gmm_weight, PtrStepf gmm_variance, PtrStep<WorkT> gmm_mean,
|
||||
const float alphaT, const float alpha1, const float prune)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x >= frame.cols || y >= frame.rows)
|
||||
return;
|
||||
|
||||
WorkT pix = cvt(frame(y, x));
|
||||
|
||||
//calculate distances to the modes (+ sort)
|
||||
//here we need to go in descending order!!!
|
||||
|
||||
bool background = false; // true - the pixel classified as background
|
||||
|
||||
//internal:
|
||||
|
||||
bool fitsPDF = false; //if it remains zero a new GMM mode will be added
|
||||
|
||||
int nmodes = modesUsed(y, x);
|
||||
int nNewModes = nmodes; //current number of modes in GMM
|
||||
|
||||
float totalWeight = 0.0f;
|
||||
|
||||
//go through all modes
|
||||
|
||||
for (int mode = 0; mode < nmodes; ++mode)
|
||||
{
|
||||
//need only weight if fit is found
|
||||
float weight = alpha1 * gmm_weight(mode * frame.rows + y, x) + prune;
|
||||
|
||||
//fit not found yet
|
||||
if (!fitsPDF)
|
||||
{
|
||||
//check if it belongs to some of the remaining modes
|
||||
float var = gmm_variance(mode * frame.rows + y, x);
|
||||
|
||||
WorkT mean = gmm_mean(mode * frame.rows + y, x);
|
||||
|
||||
//calculate difference and distance
|
||||
WorkT diff = mean - pix;
|
||||
float dist2 = sqr(diff);
|
||||
|
||||
//background? - Tb - usually larger than Tg
|
||||
if (totalWeight < c_TB && dist2 < c_Tb * var)
|
||||
background = true;
|
||||
|
||||
//check fit
|
||||
if (dist2 < c_Tg * var)
|
||||
{
|
||||
//belongs to the mode
|
||||
fitsPDF = true;
|
||||
|
||||
//update distribution
|
||||
|
||||
//update weight
|
||||
weight += alphaT;
|
||||
float k = alphaT / weight;
|
||||
|
||||
//update mean
|
||||
gmm_mean(mode * frame.rows + y, x) = mean - k * diff;
|
||||
|
||||
//update variance
|
||||
float varnew = var + k * (dist2 - var);
|
||||
|
||||
//limit the variance
|
||||
varnew = ::fmaxf(varnew, c_varMin);
|
||||
varnew = ::fminf(varnew, c_varMax);
|
||||
|
||||
gmm_variance(mode * frame.rows + y, x) = varnew;
|
||||
|
||||
//sort
|
||||
//all other weights are at the same place and
|
||||
//only the matched (iModes) is higher -> just find the new place for it
|
||||
|
||||
for (int i = mode; i > 0; --i)
|
||||
{
|
||||
//check one up
|
||||
if (weight < gmm_weight((i - 1) * frame.rows + y, x))
|
||||
break;
|
||||
|
||||
//swap one up
|
||||
swap(gmm_weight, x, y, i - 1, frame.rows);
|
||||
swap(gmm_variance, x, y, i - 1, frame.rows);
|
||||
swap(gmm_mean, x, y, i - 1, frame.rows);
|
||||
}
|
||||
|
||||
//belongs to the mode - bFitsPDF becomes 1
|
||||
}
|
||||
} // !fitsPDF
|
||||
|
||||
//check prune
|
||||
if (weight < -prune)
|
||||
{
|
||||
weight = 0.0;
|
||||
nmodes--;
|
||||
}
|
||||
|
||||
gmm_weight(mode * frame.rows + y, x) = weight; //update weight by the calculated value
|
||||
totalWeight += weight;
|
||||
}
|
||||
|
||||
//renormalize weights
|
||||
|
||||
totalWeight = 1.f / totalWeight;
|
||||
for (int mode = 0; mode < nmodes; ++mode)
|
||||
gmm_weight(mode * frame.rows + y, x) *= totalWeight;
|
||||
|
||||
nmodes = nNewModes;
|
||||
|
||||
//make new mode if needed and exit
|
||||
|
||||
if (!fitsPDF)
|
||||
{
|
||||
// replace the weakest or add a new one
|
||||
int mode = nmodes == c_nmixtures ? c_nmixtures - 1 : nmodes++;
|
||||
|
||||
if (nmodes == 1)
|
||||
gmm_weight(mode * frame.rows + y, x) = 1.f;
|
||||
else
|
||||
{
|
||||
                    gmm_weight(mode * frame.rows + y, x) = alphaT;

                    // renormalize all other weights
                    for (int i = 0; i < nmodes - 1; ++i)
                        gmm_weight(i * frame.rows + y, x) *= alpha1;
                }

                // init
                gmm_mean(mode * frame.rows + y, x) = pix;
                gmm_variance(mode * frame.rows + y, x) = c_varInit;

                // sort
                // find the new place for it
                for (int i = nmodes - 1; i > 0; --i)
                {
                    // check one up
                    if (alphaT < gmm_weight((i - 1) * frame.rows + y, x))
                        break;

                    // swap one up
                    swap(gmm_weight, x, y, i - 1, frame.rows);
                    swap(gmm_variance, x, y, i - 1, frame.rows);
                    swap(gmm_mean, x, y, i - 1, frame.rows);
                }
            }

            // set the number of modes
            modesUsed(y, x) = nmodes;

            bool isShadow = false;
            if (detectShadows && !background)
            {
                float tWeight = 0.0f;

                // check all the components marked as background:
                for (int mode = 0; mode < nmodes; ++mode)
                {
                    WorkT mean = gmm_mean(mode * frame.rows + y, x);

                    WorkT pix_mean = pix * mean;

                    float numerator = sum(pix_mean);
                    float denominator = sqr(mean);

                    // no division by zero allowed
                    if (denominator == 0)
                        break;

                    // if tau < a < 1 then also check the color distortion
                    if (numerator <= denominator && numerator >= c_tau * denominator)
                    {
                        float a = numerator / denominator;

                        WorkT dD = a * mean - pix;

                        if (sqr(dD) < c_Tb * gmm_variance(mode * frame.rows + y, x) * a * a)
                        {
                            isShadow = true;
                            break;
                        }
                    }

                    tWeight += gmm_weight(mode * frame.rows + y, x);
                    if (tWeight > c_TB)
                        break;
                }
            }

            fgmask(y, x) = background ? 0 : isShadow ? c_shadowVal : 255;
        }

        template <typename SrcT, typename WorkT>
        void mog2_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean,
                         float alphaT, float prune, bool detectShadows, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));

            const float alpha1 = 1.0f - alphaT;

            if (detectShadows)
            {
                cudaSafeCall( cudaFuncSetCacheConfig(mog2<true, SrcT, WorkT>, cudaFuncCachePreferL1) );

                mog2<true, SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, modesUsed,
                                                                    weight, variance, (PtrStepSz<WorkT>) mean,
                                                                    alphaT, alpha1, prune);
            }
            else
            {
                cudaSafeCall( cudaFuncSetCacheConfig(mog2<false, SrcT, WorkT>, cudaFuncCachePreferL1) );

                mog2<false, SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, modesUsed,
                                                                     weight, variance, (PtrStepSz<WorkT>) mean,
                                                                     alphaT, alpha1, prune);
            }

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean,
                      float alphaT, float prune, bool detectShadows, cudaStream_t stream)
        {
            typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);

            static const func_t funcs[] =
            {
                0, mog2_caller<uchar, float>, 0, mog2_caller<uchar3, float3>, mog2_caller<uchar4, float4>
            };

            funcs[cn](frame, fgmask, modesUsed, weight, variance, mean, alphaT, prune, detectShadows, stream);
        }
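
        // Illustrative host-side sketch (the GpuMat names and allocations below are
        // assumptions about the caller, not part of this file): the per-pixel mixture
        // buffers are stacked plane by plane, so a model with N modes over an H x W
        // frame stores mode k of pixel (y, x) at row k * H + y of an (N * H) x W matrix.
        //
        //     cv::gpu::GpuMat frame;                                   // CV_8UC3 input, H x W
        //     cv::gpu::GpuMat fgmask(H, W, CV_8UC1);
        //     cv::gpu::GpuMat modesUsed(H, W, CV_8UC1, cv::Scalar::all(0));
        //     cv::gpu::GpuMat weight(N * H, W, CV_32FC1, cv::Scalar::all(0));
        //     cv::gpu::GpuMat variance(N * H, W, CV_32FC1, cv::Scalar::all(0));
        //     cv::gpu::GpuMat mean(N * H, W, CV_32FC3, cv::Scalar::all(0));
        //
        //     mog2_gpu(frame, frame.channels(), fgmask, modesUsed, weight, variance, mean,
        //              alphaT, prune, true, 0);
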
        template <typename WorkT, typename OutT>
        __global__ void getBackgroundImage2(const PtrStepSzb modesUsed, const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, PtrStep<OutT> dst)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= modesUsed.cols || y >= modesUsed.rows)
                return;

            int nmodes = modesUsed(y, x);

            WorkT meanVal = VecTraits<WorkT>::all(0.0f);
            float totalWeight = 0.0f;

            for (int mode = 0; mode < nmodes; ++mode)
            {
                float weight = gmm_weight(mode * modesUsed.rows + y, x);

                WorkT mean = gmm_mean(mode * modesUsed.rows + y, x);
                meanVal = meanVal + weight * mean;

                totalWeight += weight;

                if (totalWeight > c_TB)
                    break;
            }

            meanVal = meanVal * (1.f / totalWeight);

            dst(y, x) = saturate_cast<OutT>(meanVal);
        }
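
        // The reconstructed background is the weight-normalised mean of the most
        // significant modes: the kernel above accumulates w_k * mu_k until the running
        // weight exceeds c_TB, then divides by the accumulated weight. Restated as a
        // formula (illustrative, derived from the code, not from separate documentation):
        //
        //     background(y, x) = (sum_{k<K} w_k * mu_k) / (sum_{k<K} w_k),
        //     where K is the first mode count whose cumulative weight exceeds c_TB.
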
        template <typename WorkT, typename OutT>
        void getBackgroundImage2_caller(PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(modesUsed.cols, block.x), divUp(modesUsed.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(getBackgroundImage2<WorkT, OutT>, cudaFuncCachePreferL1) );

            getBackgroundImage2<WorkT, OutT><<<grid, block, 0, stream>>>(modesUsed, weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<OutT>) dst);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream)
        {
            typedef void (*func_t)(PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream);

            static const func_t funcs[] =
            {
                0, getBackgroundImage2_caller<float, uchar>, 0, getBackgroundImage2_caller<float3, uchar3>, getBackgroundImage2_caller<float4, uchar4>
            };

            funcs[cn](modesUsed, weight, mean, dst, stream);
        }
    }
}}}

#endif /* CUDA_DISABLER */
@@ -1,801 +0,0 @@
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
#include "opencv2/core/cuda/vec_math.hpp"
|
||||
#include "opencv2/core/cuda/limits.hpp"
|
||||
#include "opencv2/core/cuda/utility.hpp"
|
||||
#include "opencv2/core/cuda/reduce.hpp"
|
||||
#include "opencv2/core/cuda/functional.hpp"
|
||||
#include "fgd_bgfg_common.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::cudev;
|
||||
|
||||
namespace bgfg
|
||||
{
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// calcDiffHistogram
|
||||
|
||||
const unsigned int UINT_BITS = 32U;
|
||||
const int LOG_WARP_SIZE = 5;
|
||||
const int WARP_SIZE = 1 << LOG_WARP_SIZE;
|
||||
#if (__CUDA_ARCH__ < 120)
|
||||
const unsigned int TAG_MASK = (1U << (UINT_BITS - LOG_WARP_SIZE)) - 1U;
|
||||
#endif
|
||||
|
||||
const int MERGE_THREADBLOCK_SIZE = 256;
|
||||
|
||||
__device__ __forceinline__ void addByte(unsigned int* s_WarpHist_, unsigned int data, unsigned int threadTag)
|
||||
{
|
||||
#if (__CUDA_ARCH__ < 120)
|
||||
volatile unsigned int* s_WarpHist = s_WarpHist_;
|
||||
unsigned int count;
|
||||
do
|
||||
{
|
||||
count = s_WarpHist[data] & TAG_MASK;
|
||||
count = threadTag | (count + 1);
|
||||
s_WarpHist[data] = count;
|
||||
} while (s_WarpHist[data] != count);
|
||||
#else
|
||||
atomicInc(s_WarpHist_ + data, (unsigned int)(-1));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template <typename PT, typename CT>
|
||||
__global__ void calcPartialHistogram(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2)
|
||||
{
|
||||
#if (__CUDA_ARCH__ < 200)
|
||||
const int HISTOGRAM_WARP_COUNT = 4;
|
||||
#else
|
||||
const int HISTOGRAM_WARP_COUNT = 6;
|
||||
#endif
|
||||
const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
|
||||
const int HISTOGRAM_THREADBLOCK_MEMORY = HISTOGRAM_WARP_COUNT * HISTOGRAM_BIN_COUNT;
|
||||
|
||||
//Per-warp subhistogram storage
|
||||
__shared__ unsigned int s_Hist0[HISTOGRAM_THREADBLOCK_MEMORY];
|
||||
__shared__ unsigned int s_Hist1[HISTOGRAM_THREADBLOCK_MEMORY];
|
||||
__shared__ unsigned int s_Hist2[HISTOGRAM_THREADBLOCK_MEMORY];
|
||||
|
||||
//Clear shared memory storage for current threadblock before processing
|
||||
#pragma unroll
|
||||
for (int i = 0; i < (HISTOGRAM_THREADBLOCK_MEMORY / HISTOGRAM_THREADBLOCK_SIZE); ++i)
|
||||
{
|
||||
s_Hist0[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
|
||||
s_Hist1[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
|
||||
s_Hist2[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
const unsigned int warpId = threadIdx.x >> LOG_WARP_SIZE;
|
||||
|
||||
unsigned int* s_WarpHist0 = s_Hist0 + warpId * HISTOGRAM_BIN_COUNT;
|
||||
unsigned int* s_WarpHist1 = s_Hist1 + warpId * HISTOGRAM_BIN_COUNT;
|
||||
unsigned int* s_WarpHist2 = s_Hist2 + warpId * HISTOGRAM_BIN_COUNT;
|
||||
|
||||
const unsigned int tag = threadIdx.x << (UINT_BITS - LOG_WARP_SIZE);
|
||||
const int dataCount = prevFrame.rows * prevFrame.cols;
|
||||
for (unsigned int pos = blockIdx.x * HISTOGRAM_THREADBLOCK_SIZE + threadIdx.x; pos < dataCount; pos += HISTOGRAM_THREADBLOCK_SIZE * PARTIAL_HISTOGRAM_COUNT)
|
||||
{
|
||||
const unsigned int y = pos / prevFrame.cols;
|
||||
const unsigned int x = pos % prevFrame.cols;
|
||||
|
||||
PT prevVal = prevFrame(y, x);
|
||||
CT curVal = curFrame(y, x);
|
||||
|
||||
int3 diff = make_int3(
|
||||
::abs(curVal.x - prevVal.x),
|
||||
::abs(curVal.y - prevVal.y),
|
||||
::abs(curVal.z - prevVal.z)
|
||||
);
|
||||
|
||||
addByte(s_WarpHist0, diff.x, tag);
|
||||
addByte(s_WarpHist1, diff.y, tag);
|
||||
addByte(s_WarpHist2, diff.z, tag);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
//Merge per-warp histograms into per-block and write to global memory
|
||||
for (unsigned int bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += HISTOGRAM_THREADBLOCK_SIZE)
|
||||
{
|
||||
unsigned int sum0 = 0;
|
||||
unsigned int sum1 = 0;
|
||||
unsigned int sum2 = 0;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < HISTOGRAM_WARP_COUNT; ++i)
|
||||
{
|
||||
#if (__CUDA_ARCH__ < 120)
|
||||
sum0 += s_Hist0[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
|
||||
sum1 += s_Hist1[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
|
||||
sum2 += s_Hist2[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
|
||||
#else
|
||||
sum0 += s_Hist0[bin + i * HISTOGRAM_BIN_COUNT];
|
||||
sum1 += s_Hist1[bin + i * HISTOGRAM_BIN_COUNT];
|
||||
sum2 += s_Hist2[bin + i * HISTOGRAM_BIN_COUNT];
|
||||
#endif
|
||||
}
|
||||
|
||||
partialBuf0[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum0;
|
||||
partialBuf1[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum1;
|
||||
partialBuf2[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum2;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void mergeHistogram(const unsigned int* partialBuf0, const unsigned int* partialBuf1, const unsigned int* partialBuf2, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2)
|
||||
{
|
||||
unsigned int sum0 = 0;
|
||||
unsigned int sum1 = 0;
|
||||
unsigned int sum2 = 0;
|
||||
|
||||
#pragma unroll
|
||||
for (unsigned int i = threadIdx.x; i < PARTIAL_HISTOGRAM_COUNT; i += MERGE_THREADBLOCK_SIZE)
|
||||
{
|
||||
sum0 += partialBuf0[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
|
||||
sum1 += partialBuf1[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
|
||||
sum2 += partialBuf2[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
|
||||
}
|
||||
|
||||
__shared__ unsigned int data0[MERGE_THREADBLOCK_SIZE];
|
||||
__shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
|
||||
__shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
|
||||
|
||||
plus<unsigned int> op;
|
||||
reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
hist0[blockIdx.x] = sum0;
|
||||
hist1[blockIdx.x] = sum1;
|
||||
hist2[blockIdx.x] = sum2;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename PT, typename CT>
|
||||
void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame,
|
||||
unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
|
||||
unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
|
||||
bool cc20, cudaStream_t stream)
|
||||
{
|
||||
const int HISTOGRAM_WARP_COUNT = cc20 ? 6 : 4;
|
||||
const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
|
||||
|
||||
calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, HISTOGRAM_THREADBLOCK_SIZE, 0, stream>>>(
|
||||
(PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame, partialBuf0, partialBuf1, partialBuf2);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
mergeHistogram<<<HISTOGRAM_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(partialBuf0, partialBuf1, partialBuf2, hist0, hist1, hist2);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
|
||||
template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// calcDiffThreshMask
|
||||
|
||||
template <typename PT, typename CT>
|
||||
__global__ void calcDiffThreshMask(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, uchar3 bestThres, PtrStepb changeMask)
|
||||
{
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (y >= prevFrame.rows || x >= prevFrame.cols)
|
||||
return;
|
||||
|
||||
PT prevVal = prevFrame(y, x);
|
||||
CT curVal = curFrame(y, x);
|
||||
|
||||
int3 diff = make_int3(
|
||||
::abs(curVal.x - prevVal.x),
|
||||
::abs(curVal.y - prevVal.y),
|
||||
::abs(curVal.z - prevVal.z)
|
||||
);
|
||||
|
||||
if (diff.x > bestThres.x || diff.y > bestThres.y || diff.z > bestThres.z)
|
||||
changeMask(y, x) = 255;
|
||||
}
|
||||
|
||||
template <typename PT, typename CT>
|
||||
void calcDiffThreshMask_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
|
||||
|
||||
calcDiffThreshMask<PT, CT><<<grid, block, 0, stream>>>((PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame, bestThres, changeMask);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void calcDiffThreshMask_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
|
||||
template void calcDiffThreshMask_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
|
||||
template void calcDiffThreshMask_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
|
||||
template void calcDiffThreshMask_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// bgfgClassification
|
||||
|
||||
__constant__ BGPixelStat c_stat;
|
||||
|
||||
void setBGPixelStat(const BGPixelStat& stat)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_stat, &stat, sizeof(BGPixelStat)) );
|
||||
}
|
||||
|
||||
template <typename T> struct Output;
|
||||
template <> struct Output<uchar3>
|
||||
{
|
||||
static __device__ __forceinline__ uchar3 make(uchar v0, uchar v1, uchar v2)
|
||||
{
|
||||
return make_uchar3(v0, v1, v2);
|
||||
}
|
||||
};
|
||||
template <> struct Output<uchar4>
|
||||
{
|
||||
static __device__ __forceinline__ uchar4 make(uchar v0, uchar v1, uchar v2)
|
||||
{
|
||||
return make_uchar4(v0, v1, v2, 255);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename PT, typename CT, typename OT>
|
||||
__global__ void bgfgClassification(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame,
|
||||
const PtrStepb Ftd, const PtrStepb Fbd, PtrStepb foreground,
|
||||
int deltaC, int deltaCC, float alpha2, int N1c, int N1cc)
|
||||
{
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (i >= prevFrame.rows || j >= prevFrame.cols)
|
||||
return;
|
||||
|
||||
if (Fbd(i, j) || Ftd(i, j))
|
||||
{
|
||||
float Pb = 0.0f;
|
||||
float Pv = 0.0f;
|
||||
float Pvb = 0.0f;
|
||||
|
||||
int val = 0;
|
||||
|
||||
// Is it a motion pixel?
|
||||
if (Ftd(i, j))
|
||||
{
|
||||
if (!c_stat.is_trained_dyn_model(i, j))
|
||||
val = 1;
|
||||
else
|
||||
{
|
||||
PT prevVal = prevFrame(i, j);
|
||||
CT curVal = curFrame(i, j);
|
||||
|
||||
// Compare with stored CCt vectors:
|
||||
for (int k = 0; k < N1cc && c_stat.PV_CC(i, j, k) > alpha2; ++k)
|
||||
{
|
||||
OT v1 = c_stat.V1_CC<OT>(i, j, k);
|
||||
OT v2 = c_stat.V2_CC<OT>(i, j, k);
|
||||
|
||||
if (::abs(v1.x - prevVal.x) <= deltaCC &&
|
||||
::abs(v1.y - prevVal.y) <= deltaCC &&
|
||||
::abs(v1.z - prevVal.z) <= deltaCC &&
|
||||
::abs(v2.x - curVal.x) <= deltaCC &&
|
||||
::abs(v2.y - curVal.y) <= deltaCC &&
|
||||
::abs(v2.z - curVal.z) <= deltaCC)
|
||||
{
|
||||
Pv += c_stat.PV_CC(i, j, k);
|
||||
Pvb += c_stat.PVB_CC(i, j, k);
|
||||
}
|
||||
}
|
||||
|
||||
Pb = c_stat.Pbcc(i, j);
|
||||
if (2 * Pvb * Pb <= Pv)
|
||||
val = 1;
|
||||
}
|
||||
}
|
||||
else if(c_stat.is_trained_st_model(i, j))
|
||||
{
|
||||
CT curVal = curFrame(i, j);
|
||||
|
||||
// Compare with stored Ct vectors:
|
||||
for (int k = 0; k < N1c && c_stat.PV_C(i, j, k) > alpha2; ++k)
|
||||
{
|
||||
OT v = c_stat.V_C<OT>(i, j, k);
|
||||
|
||||
if (::abs(v.x - curVal.x) <= deltaC &&
|
||||
::abs(v.y - curVal.y) <= deltaC &&
|
||||
::abs(v.z - curVal.z) <= deltaC)
|
||||
{
|
||||
Pv += c_stat.PV_C(i, j, k);
|
||||
Pvb += c_stat.PVB_C(i, j, k);
|
||||
}
|
||||
}
|
||||
Pb = c_stat.Pbc(i, j);
|
||||
if (2 * Pvb * Pb <= Pv)
|
||||
val = 1;
|
||||
}
|
||||
|
||||
// Update foreground:
|
||||
foreground(i, j) = static_cast<uchar>(val);
|
||||
} // end if( change detection...
|
||||
}
|
||||
|
||||
template <typename PT, typename CT, typename OT>
|
||||
void bgfgClassification_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground,
|
||||
int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
|
||||
|
||||
cudaSafeCall( cudaFuncSetCacheConfig(bgfgClassification<PT, CT, OT>, cudaFuncCachePreferL1) );
|
||||
|
||||
bgfgClassification<PT, CT, OT><<<grid, block, 0, stream>>>((PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame,
|
||||
Ftd, Fbd, foreground,
|
||||
deltaC, deltaCC, alpha2, N1c, N1cc);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void bgfgClassification_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
|
||||
template void bgfgClassification_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
|
||||
template void bgfgClassification_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
|
||||
template void bgfgClassification_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
|
||||
template void bgfgClassification_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
|
||||
template void bgfgClassification_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
|
||||
template void bgfgClassification_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
|
||||
template void bgfgClassification_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// updateBackgroundModel
|
||||
|
||||
template <typename PT, typename CT, typename OT, class PrevFramePtr2D, class CurFramePtr2D, class FtdPtr2D, class FbdPtr2D>
|
||||
__global__ void updateBackgroundModel(int cols, int rows, const PrevFramePtr2D prevFrame, const CurFramePtr2D curFrame, const FtdPtr2D Ftd, const FbdPtr2D Fbd,
|
||||
PtrStepb foreground, PtrStep<OT> background,
|
||||
int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T)
|
||||
{
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (i >= rows || j >= cols)
|
||||
return;
|
||||
|
||||
const float MIN_PV = 1e-10f;
|
||||
|
||||
const uchar is_trained_dyn_model = c_stat.is_trained_dyn_model(i, j);
|
||||
if (Ftd(i, j) || !is_trained_dyn_model)
|
||||
{
|
||||
const float alpha = is_trained_dyn_model ? alpha2 : alpha3;
|
||||
|
||||
float Pbcc = c_stat.Pbcc(i, j);
|
||||
|
||||
//update Pb
|
||||
Pbcc *= (1.0f - alpha);
|
||||
if (!foreground(i, j))
|
||||
{
|
||||
Pbcc += alpha;
|
||||
}
|
||||
|
||||
int min_dist = numeric_limits<int>::max();
|
||||
int indx = -1;
|
||||
|
||||
PT prevVal = prevFrame(i, j);
|
||||
CT curVal = curFrame(i, j);
|
||||
|
||||
// Find best Vi match:
|
||||
for (int k = 0; k < N2cc; ++k)
|
||||
{
|
||||
float PV_CC = c_stat.PV_CC(i, j, k);
|
||||
if (!PV_CC)
|
||||
break;
|
||||
|
||||
if (PV_CC < MIN_PV)
|
||||
{
|
||||
c_stat.PV_CC(i, j, k) = 0;
|
||||
c_stat.PVB_CC(i, j, k) = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
c_stat.PV_CC(i, j, k) = PV_CC * (1.0f - alpha);
|
||||
c_stat.PVB_CC(i, j, k) = c_stat.PVB_CC(i, j, k) * (1.0f - alpha);
|
||||
|
||||
OT v1 = c_stat.V1_CC<OT>(i, j, k);
|
||||
|
||||
int3 val1 = make_int3(
|
||||
::abs(v1.x - prevVal.x),
|
||||
::abs(v1.y - prevVal.y),
|
||||
::abs(v1.z - prevVal.z)
|
||||
);
|
||||
|
||||
OT v2 = c_stat.V2_CC<OT>(i, j, k);
|
||||
|
||||
int3 val2 = make_int3(
|
||||
::abs(v2.x - curVal.x),
|
||||
::abs(v2.y - curVal.y),
|
||||
::abs(v2.z - curVal.z)
|
||||
);
|
||||
|
||||
int dist = val1.x + val1.y + val1.z + val2.x + val2.y + val2.z;
|
||||
|
||||
if (dist < min_dist &&
|
||||
val1.x <= deltaCC && val1.y <= deltaCC && val1.z <= deltaCC &&
|
||||
val2.x <= deltaCC && val2.y <= deltaCC && val2.z <= deltaCC)
|
||||
{
|
||||
min_dist = dist;
|
||||
indx = k;
|
||||
}
|
||||
}
|
||||
|
||||
if (indx < 0)
|
||||
{
|
||||
// Replace N2th elem in the table by new feature:
|
||||
indx = N2cc - 1;
|
||||
c_stat.PV_CC(i, j, indx) = alpha;
|
||||
c_stat.PVB_CC(i, j, indx) = alpha;
|
||||
|
||||
//update Vt
|
||||
c_stat.V1_CC<OT>(i, j, indx) = Output<OT>::make(prevVal.x, prevVal.y, prevVal.z);
|
||||
c_stat.V2_CC<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Update:
|
||||
c_stat.PV_CC(i, j, indx) += alpha;
|
||||
|
||||
if (!foreground(i, j))
|
||||
{
|
||||
c_stat.PVB_CC(i, j, indx) += alpha;
|
||||
}
|
||||
}
|
||||
|
||||
//re-sort CCt table by Pv
|
||||
const float PV_CC_indx = c_stat.PV_CC(i, j, indx);
|
||||
const float PVB_CC_indx = c_stat.PVB_CC(i, j, indx);
|
||||
const OT V1_CC_indx = c_stat.V1_CC<OT>(i, j, indx);
|
||||
const OT V2_CC_indx = c_stat.V2_CC<OT>(i, j, indx);
|
||||
for (int k = 0; k < indx; ++k)
|
||||
{
|
||||
if (c_stat.PV_CC(i, j, k) <= PV_CC_indx)
|
||||
{
|
||||
//shift elements
|
||||
float Pv_tmp1;
|
||||
float Pv_tmp2 = PV_CC_indx;
|
||||
|
||||
float Pvb_tmp1;
|
||||
float Pvb_tmp2 = PVB_CC_indx;
|
||||
|
||||
OT v1_tmp1;
|
||||
OT v1_tmp2 = V1_CC_indx;
|
||||
|
||||
OT v2_tmp1;
|
||||
OT v2_tmp2 = V2_CC_indx;
|
||||
|
||||
for (int l = k; l <= indx; ++l)
|
||||
{
|
||||
Pv_tmp1 = c_stat.PV_CC(i, j, l);
|
||||
c_stat.PV_CC(i, j, l) = Pv_tmp2;
|
||||
Pv_tmp2 = Pv_tmp1;
|
||||
|
||||
Pvb_tmp1 = c_stat.PVB_CC(i, j, l);
|
||||
c_stat.PVB_CC(i, j, l) = Pvb_tmp2;
|
||||
Pvb_tmp2 = Pvb_tmp1;
|
||||
|
||||
v1_tmp1 = c_stat.V1_CC<OT>(i, j, l);
|
||||
c_stat.V1_CC<OT>(i, j, l) = v1_tmp2;
|
||||
v1_tmp2 = v1_tmp1;
|
||||
|
||||
v2_tmp1 = c_stat.V2_CC<OT>(i, j, l);
|
||||
c_stat.V2_CC<OT>(i, j, l) = v2_tmp2;
|
||||
v2_tmp2 = v2_tmp1;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
float sum1 = 0.0f;
|
||||
float sum2 = 0.0f;
|
||||
|
||||
//check "once-off" changes
|
||||
for (int k = 0; k < N1cc; ++k)
|
||||
{
|
||||
const float PV_CC = c_stat.PV_CC(i, j, k);
|
||||
if (!PV_CC)
|
||||
break;
|
||||
|
||||
sum1 += PV_CC;
|
||||
sum2 += c_stat.PVB_CC(i, j, k);
|
||||
}
|
||||
|
||||
if (sum1 > T)
|
||||
c_stat.is_trained_dyn_model(i, j) = 1;
|
||||
|
||||
float diff = sum1 - Pbcc * sum2;
|
||||
|
||||
// Update stat table:
|
||||
if (diff > T)
|
||||
{
|
||||
//new BG features are discovered
|
||||
for (int k = 0; k < N1cc; ++k)
|
||||
{
|
||||
const float PV_CC = c_stat.PV_CC(i, j, k);
|
||||
if (!PV_CC)
|
||||
break;
|
||||
|
||||
c_stat.PVB_CC(i, j, k) = (PV_CC - Pbcc * c_stat.PVB_CC(i, j, k)) / (1.0f - Pbcc);
|
||||
}
|
||||
}
|
||||
|
||||
c_stat.Pbcc(i, j) = Pbcc;
|
||||
}
|
||||
|
||||
// Handle "stationary" pixel:
|
||||
if (!Ftd(i, j))
|
||||
{
|
||||
const float alpha = c_stat.is_trained_st_model(i, j) ? alpha2 : alpha3;
|
||||
|
||||
float Pbc = c_stat.Pbc(i, j);
|
||||
|
||||
//update Pb
|
||||
Pbc *= (1.0f - alpha);
|
||||
if (!foreground(i, j))
|
||||
{
|
||||
Pbc += alpha;
|
||||
}
|
||||
|
||||
int min_dist = numeric_limits<int>::max();
|
||||
int indx = -1;
|
||||
|
||||
CT curVal = curFrame(i, j);
|
||||
|
||||
//find best Vi match
|
||||
for (int k = 0; k < N2c; ++k)
|
||||
{
|
||||
float PV_C = c_stat.PV_C(i, j, k);
|
||||
|
||||
if (PV_C < MIN_PV)
|
||||
{
|
||||
c_stat.PV_C(i, j, k) = 0;
|
||||
c_stat.PVB_C(i, j, k) = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Exponential decay of memory
|
||||
c_stat.PV_C(i, j, k) = PV_C * (1.0f - alpha);
|
||||
c_stat.PVB_C(i, j, k) = c_stat.PVB_C(i, j, k) * (1.0f - alpha);
|
||||
|
||||
OT v = c_stat.V_C<OT>(i, j, k);
|
||||
int3 val = make_int3(
|
||||
::abs(v.x - curVal.x),
|
||||
::abs(v.y - curVal.y),
|
||||
::abs(v.z - curVal.z)
|
||||
);
|
||||
|
||||
int dist = val.x + val.y + val.z;
|
||||
|
||||
if (dist < min_dist && val.x <= deltaC && val.y <= deltaC && val.z <= deltaC)
|
||||
{
|
||||
min_dist = dist;
|
||||
indx = k;
|
||||
}
|
||||
}
|
||||
|
||||
if (indx < 0)
|
||||
{
|
||||
//the N2-th element in the table is replaced by the new feature
|
||||
indx = N2c - 1;
|
||||
|
||||
c_stat.PV_C(i, j, indx) = alpha;
|
||||
c_stat.PVB_C(i, j, indx) = alpha;
|
||||
|
||||
//update Vt
|
||||
c_stat.V_C<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
|
||||
}
|
||||
else
|
||||
{
|
||||
//update
|
||||
c_stat.PV_C(i, j, indx) += alpha;
|
||||
|
||||
if (!foreground(i, j))
|
||||
{
|
||||
c_stat.PVB_C(i, j, indx) += alpha;
|
||||
}
|
||||
}
|
||||
|
||||
//re-sort Ct table by Pv
|
||||
const float PV_C_indx = c_stat.PV_C(i, j, indx);
|
||||
const float PVB_C_indx = c_stat.PVB_C(i, j, indx);
|
||||
OT V_C_indx = c_stat.V_C<OT>(i, j, indx);
|
||||
for (int k = 0; k < indx; ++k)
|
||||
{
|
||||
if (c_stat.PV_C(i, j, k) <= PV_C_indx)
|
||||
{
|
||||
//shift elements
|
||||
float Pv_tmp1;
|
||||
float Pv_tmp2 = PV_C_indx;
|
||||
|
||||
float Pvb_tmp1;
|
||||
float Pvb_tmp2 = PVB_C_indx;
|
||||
|
||||
OT v_tmp1;
|
||||
OT v_tmp2 = V_C_indx;
|
||||
|
||||
for (int l = k; l <= indx; ++l)
|
||||
{
|
||||
Pv_tmp1 = c_stat.PV_C(i, j, l);
|
||||
c_stat.PV_C(i, j, l) = Pv_tmp2;
|
||||
Pv_tmp2 = Pv_tmp1;
|
||||
|
||||
Pvb_tmp1 = c_stat.PVB_C(i, j, l);
|
||||
c_stat.PVB_C(i, j, l) = Pvb_tmp2;
|
||||
Pvb_tmp2 = Pvb_tmp1;
|
||||
|
||||
v_tmp1 = c_stat.V_C<OT>(i, j, l);
|
||||
c_stat.V_C<OT>(i, j, l) = v_tmp2;
|
||||
v_tmp2 = v_tmp1;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Check "once-off" changes:
|
||||
float sum1 = 0.0f;
|
||||
float sum2 = 0.0f;
|
||||
for (int k = 0; k < N1c; ++k)
|
||||
{
|
||||
const float PV_C = c_stat.PV_C(i, j, k);
|
||||
if (!PV_C)
|
||||
break;
|
||||
|
||||
sum1 += PV_C;
|
||||
sum2 += c_stat.PVB_C(i, j, k);
|
||||
}
|
||||
|
||||
if (sum1 > T)
|
||||
c_stat.is_trained_st_model(i, j) = 1;
|
||||
|
||||
float diff = sum1 - Pbc * sum2;
|
||||
|
||||
// Update stat table:
|
||||
if (diff > T)
|
||||
{
|
||||
//new BG features are discovered
|
||||
for (int k = 0; k < N1c; ++k)
|
||||
{
|
||||
const float PV_C = c_stat.PV_C(i, j, k);
|
||||
if (!PV_C)
|
||||
break;
|
||||
|
||||
c_stat.PVB_C(i, j, k) = (PV_C - Pbc * c_stat.PVB_C(i, j, k)) / (1.0f - Pbc);
|
||||
}
|
||||
|
||||
c_stat.Pbc(i, j) = 1.0f - Pbc;
|
||||
}
|
||||
else
|
||||
{
|
||||
c_stat.Pbc(i, j) = Pbc;
|
||||
}
|
||||
} // if !(change detection) at pixel (i,j)
|
||||
|
||||
// Update the reference BG image:
|
||||
if (!foreground(i, j))
|
||||
{
|
||||
CT curVal = curFrame(i, j);
|
||||
|
||||
if (!Ftd(i, j) && !Fbd(i, j))
|
||||
{
|
||||
// Apply IIR filter:
|
||||
OT oldVal = background(i, j);
|
||||
|
||||
int3 newVal = make_int3(
|
||||
__float2int_rn(oldVal.x * (1.0f - alpha1) + curVal.x * alpha1),
|
||||
__float2int_rn(oldVal.y * (1.0f - alpha1) + curVal.y * alpha1),
|
||||
__float2int_rn(oldVal.z * (1.0f - alpha1) + curVal.z * alpha1)
|
||||
);
|
||||
|
||||
background(i, j) = Output<OT>::make(
|
||||
static_cast<uchar>(newVal.x),
|
||||
static_cast<uchar>(newVal.y),
|
||||
static_cast<uchar>(newVal.z)
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
background(i, j) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename PT, typename CT, typename OT>
|
||||
struct UpdateBackgroundModel
|
||||
{
|
||||
static void call(PtrStepSz<PT> prevFrame, PtrStepSz<CT> curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSz<OT> background,
|
||||
int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
|
||||
|
||||
cudaSafeCall( cudaFuncSetCacheConfig(updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb>, cudaFuncCachePreferL1) );
|
||||
|
||||
updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb><<<grid, block, 0, stream>>>(
|
||||
prevFrame.cols, prevFrame.rows,
|
||||
prevFrame, curFrame,
|
||||
Ftd, Fbd, foreground, background,
|
||||
deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
|
||||
template <typename PT, typename CT, typename OT>
|
||||
void updateBackgroundModel_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background,
|
||||
int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
UpdateBackgroundModel<PT, CT, OT>::call(PtrStepSz<PT>(prevFrame), PtrStepSz<CT>(curFrame), Ftd, Fbd, foreground, PtrStepSz<OT>(background),
|
||||
deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T, stream);
|
||||
}
|
||||
|
||||
template void updateBackgroundModel_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
|
||||
template void updateBackgroundModel_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
|
||||
template void updateBackgroundModel_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
|
||||
template void updateBackgroundModel_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
|
||||
template void updateBackgroundModel_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
|
||||
template void updateBackgroundModel_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
|
||||
template void updateBackgroundModel_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
|
||||
template void updateBackgroundModel_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
||||
@@ -1,189 +0,0 @@
#ifndef __FGD_BGFG_COMMON_HPP__
#define __FGD_BGFG_COMMON_HPP__

#include "opencv2/core/cuda_devptrs.hpp"

namespace bgfg
{
    struct BGPixelStat
    {
    public:
#ifdef __CUDACC__
        __device__ float& Pbc(int i, int j);
        __device__ float& Pbcc(int i, int j);

        __device__ unsigned char& is_trained_st_model(int i, int j);
        __device__ unsigned char& is_trained_dyn_model(int i, int j);

        __device__ float& PV_C(int i, int j, int k);
        __device__ float& PVB_C(int i, int j, int k);
        template <typename T> __device__ T& V_C(int i, int j, int k);

        __device__ float& PV_CC(int i, int j, int k);
        __device__ float& PVB_CC(int i, int j, int k);
        template <typename T> __device__ T& V1_CC(int i, int j, int k);
        template <typename T> __device__ T& V2_CC(int i, int j, int k);
#endif

        int rows_;

        unsigned char* Pbc_data_;
        size_t Pbc_step_;

        unsigned char* Pbcc_data_;
        size_t Pbcc_step_;

        unsigned char* is_trained_st_model_data_;
        size_t is_trained_st_model_step_;

        unsigned char* is_trained_dyn_model_data_;
        size_t is_trained_dyn_model_step_;

        unsigned char* ctable_Pv_data_;
        size_t ctable_Pv_step_;

        unsigned char* ctable_Pvb_data_;
        size_t ctable_Pvb_step_;

        unsigned char* ctable_v_data_;
        size_t ctable_v_step_;

        unsigned char* cctable_Pv_data_;
        size_t cctable_Pv_step_;

        unsigned char* cctable_Pvb_data_;
        size_t cctable_Pvb_step_;

        unsigned char* cctable_v1_data_;
        size_t cctable_v1_step_;

        unsigned char* cctable_v2_data_;
        size_t cctable_v2_step_;
    };

#ifdef __CUDACC__
    __device__ __forceinline__ float& BGPixelStat::Pbc(int i, int j)
    {
        return *((float*)(Pbc_data_ + i * Pbc_step_) + j);
    }

    __device__ __forceinline__ float& BGPixelStat::Pbcc(int i, int j)
    {
        return *((float*)(Pbcc_data_ + i * Pbcc_step_) + j);
    }

    __device__ __forceinline__ unsigned char& BGPixelStat::is_trained_st_model(int i, int j)
    {
        return *((unsigned char*)(is_trained_st_model_data_ + i * is_trained_st_model_step_) + j);
    }

    __device__ __forceinline__ unsigned char& BGPixelStat::is_trained_dyn_model(int i, int j)
    {
        return *((unsigned char*)(is_trained_dyn_model_data_ + i * is_trained_dyn_model_step_) + j);
    }

    __device__ __forceinline__ float& BGPixelStat::PV_C(int i, int j, int k)
    {
        return *((float*)(ctable_Pv_data_ + ((k * rows_) + i) * ctable_Pv_step_) + j);
    }

    __device__ __forceinline__ float& BGPixelStat::PVB_C(int i, int j, int k)
    {
        return *((float*)(ctable_Pvb_data_ + ((k * rows_) + i) * ctable_Pvb_step_) + j);
    }

    template <typename T> __device__ __forceinline__ T& BGPixelStat::V_C(int i, int j, int k)
    {
        return *((T*)(ctable_v_data_ + ((k * rows_) + i) * ctable_v_step_) + j);
    }

    __device__ __forceinline__ float& BGPixelStat::PV_CC(int i, int j, int k)
    {
        return *((float*)(cctable_Pv_data_ + ((k * rows_) + i) * cctable_Pv_step_) + j);
    }

    __device__ __forceinline__ float& BGPixelStat::PVB_CC(int i, int j, int k)
    {
        return *((float*)(cctable_Pvb_data_ + ((k * rows_) + i) * cctable_Pvb_step_) + j);
    }

    template <typename T> __device__ __forceinline__ T& BGPixelStat::V1_CC(int i, int j, int k)
    {
        return *((T*)(cctable_v1_data_ + ((k * rows_) + i) * cctable_v1_step_) + j);
    }

    template <typename T> __device__ __forceinline__ T& BGPixelStat::V2_CC(int i, int j, int k)
    {
        return *((T*)(cctable_v2_data_ + ((k * rows_) + i) * cctable_v2_step_) + j);
    }
#endif
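
    // All per-pixel tables above live in single pitched 2-D buffers with the table
    // index folded into the row: entry (i, j, k) of a table over rows_ image rows is
    // read from row (k * rows_ + i), column j. Illustrative example with assumed
    // values (not taken from the sources): for rows_ = 480, PV_C(10, 20, 3)
    // dereferences byte offset (3 * 480 + 10) * ctable_Pv_step_ + 20 * sizeof(float).
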
    const int PARTIAL_HISTOGRAM_COUNT = 240;
    const int HISTOGRAM_BIN_COUNT = 256;

    template <typename PT, typename CT>
    void calcDiffHistogram_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
                               unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
                               unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
                               bool cc20, cudaStream_t stream);

    template <typename PT, typename CT>
    void calcDiffThreshMask_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);

    void setBGPixelStat(const BGPixelStat& stat);

    template <typename PT, typename CT, typename OT>
    void bgfgClassification_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
                                cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground,
                                int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);

    template <typename PT, typename CT, typename OT>
    void updateBackgroundModel_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
                                   cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground, cv::gpu::PtrStepSzb background,
                                   int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
                                   cudaStream_t stream);
}

#endif // __FGD_BGFG_COMMON_HPP__
@@ -1,414 +0,0 @@
#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"

using namespace cv::gpu;
using namespace cv::gpu::cudev;

namespace optflowbm
{
    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_prev(false, cudaFilterModePoint, cudaAddressModeClamp);
    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_curr(false, cudaFilterModePoint, cudaAddressModeClamp);

    __device__ int cmpBlocks(int X1, int Y1, int X2, int Y2, int2 blockSize)
    {
        int s = 0;

        for (int y = 0; y < blockSize.y; ++y)
        {
            for (int x = 0; x < blockSize.x; ++x)
                s += ::abs(tex2D(tex_prev, X1 + x, Y1 + y) - tex2D(tex_curr, X2 + x, Y2 + y));
        }

        return s;
    }

    __global__ void calcOptFlowBM(PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
                                  const int maxX, const int maxY, const int acceptLevel, const int escapeLevel,
                                  const short2* ss, const int ssCount)
    {
        const int j = blockIdx.x * blockDim.x + threadIdx.x;
        const int i = blockIdx.y * blockDim.y + threadIdx.y;

        if (i >= velx.rows || j >= velx.cols)
            return;

        const int X1 = j * shiftSize.x;
        const int Y1 = i * shiftSize.y;

        const int offX = usePrevious ? __float2int_rn(velx(i, j)) : 0;
        const int offY = usePrevious ? __float2int_rn(vely(i, j)) : 0;

        int X2 = X1 + offX;
        int Y2 = Y1 + offY;

        int dist = numeric_limits<int>::max();

        if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
            dist = cmpBlocks(X1, Y1, X2, Y2, blockSize);

        int countMin = 1;
        int sumx = offX;
        int sumy = offY;

        if (dist > acceptLevel)
        {
            // do brute-force search
            for (int k = 0; k < ssCount; ++k)
            {
                const short2 ssVal = ss[k];

                const int dx = offX + ssVal.x;
                const int dy = offY + ssVal.y;

                X2 = X1 + dx;
                Y2 = Y1 + dy;

                if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
                {
                    const int tmpDist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
                    if (tmpDist < acceptLevel)
                    {
                        sumx = dx;
                        sumy = dy;
                        countMin = 1;
                        break;
                    }

                    if (tmpDist < dist)
                    {
                        dist = tmpDist;
                        sumx = dx;
                        sumy = dy;
                        countMin = 1;
                    }
                    else if (tmpDist == dist)
                    {
                        sumx += dx;
                        sumy += dy;
                        countMin++;
                    }
                }
            }

            if (dist > escapeLevel)
            {
                sumx = offX;
                sumy = offY;
                countMin = 1;
            }
        }

        velx(i, j) = static_cast<float>(sumx) / countMin;
        vely(i, j) = static_cast<float>(sumy) / countMin;
    }

    void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
              int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream)
    {
        bindTexture(&tex_prev, prev);
        bindTexture(&tex_curr, curr);

        const dim3 block(32, 8);
        const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y));

        calcOptFlowBM<<<grid, block, 0, stream>>>(velx, vely, blockSize, shiftSize, usePrevious,
                                                  maxX, maxY, acceptLevel, escapeLevel, ss, ssCount);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}
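
// Illustrative note on the cost function above (the numbers are an example, not
// taken from the sources): cmpBlocks returns the sum of absolute differences over a
// blockSize.x x blockSize.y window, so for a 16 x 16 block the per-candidate cost
// lies in [0, 255 * 256] = [0, 65280]; acceptLevel and escapeLevel are thresholds on
// that same scale, and each (i, j) cell of velx/vely holds the averaged best
// displacement for the block anchored at (j * shiftSize.x, i * shiftSize.y).
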
/////////////////////////////////////////////////////////
// Fast approximate version

namespace optflowbm_fast
{
    enum
    {
        CTA_SIZE = 128,

        TILE_COLS = 128,
        TILE_ROWS = 32,

        STRIDE = CTA_SIZE
    };

    template <typename T> __device__ __forceinline__ int calcDist(T a, T b)
    {
        return ::abs(a - b);
    }

    template <class T> struct FastOptFlowBM
    {
        int search_radius;
        int block_radius;

        int search_window;
        int block_window;

        PtrStepSz<T> I0;
        PtrStep<T> I1;

        mutable PtrStepi buffer;

        FastOptFlowBM(int search_window_, int block_window_,
                      PtrStepSz<T> I0_, PtrStepSz<T> I1_,
                      PtrStepi buffer_) :
            search_radius(search_window_ / 2), block_radius(block_window_ / 2),
            search_window(search_window_), block_window(block_window_),
            I0(I0_), I1(I1_),
            buffer(buffer_)
        {
        }

        __device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
        {
            for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
            {
                dist_sums[index] = 0;

                for (int tx = 0; tx < block_window; ++tx)
                    col_sums(tx, index) = 0;

                int y = index / search_window;
                int x = index - y * search_window;

                int ay = i;
                int ax = j;

                int by = i + y - search_radius;
                int bx = j + x - search_radius;

                for (int tx = -block_radius; tx <= block_radius; ++tx)
                {
                    int col_sum = 0;
                    for (int ty = -block_radius; ty <= block_radius; ++ty)
                    {
                        int dist = calcDist(I0(ay + ty, ax + tx), I1(by + ty, bx + tx));

                        dist_sums[index] += dist;
                        col_sum += dist;
                    }

                    col_sums(tx + block_radius, index) = col_sum;
                }

                up_col_sums(j, index) = col_sums(block_window - 1, index);
            }
        }

        __device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
        {
            for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
            {
                int y = index / search_window;
                int x = index - y * search_window;

                int ay = i;
                int ax = j + block_radius;

                int by = i + y - search_radius;
                int bx = j + x - search_radius + block_radius;

                int col_sum = 0;

                for (int ty = -block_radius; ty <= block_radius; ++ty)
                    col_sum += calcDist(I0(ay + ty, ax), I1(by + ty, bx));

                dist_sums[index] += col_sum - col_sums(first, index);

                col_sums(first, index) = col_sum;
                up_col_sums(j, index) = col_sum;
            }
        }

        __device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
        {
            int ay = i;
            int ax = j + block_radius;

            T a_up = I0(ay - block_radius - 1, ax);
            T a_down = I0(ay + block_radius, ax);

            for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
            {
                int y = index / search_window;
                int x = index - y * search_window;

                int by = i + y - search_radius;
                int bx = j + x - search_radius + block_radius;

                T b_up = I1(by - block_radius - 1, bx);
                T b_down = I1(by + block_radius, bx);

                int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);

                dist_sums[index] += col_sum - col_sums(first, index);
                col_sums(first, index) = col_sum;
                up_col_sums(j, index) = col_sum;
            }
        }

        __device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, float& velx, float& vely) const
        {
            int bestDist = numeric_limits<int>::max();
            int bestInd = -1;

            for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
            {
                int curDist = dist_sums[index];
                if (curDist < bestDist)
                {
                    bestDist = curDist;
                    bestInd = index;
                }
            }

            __shared__ int cta_dist_buffer[CTA_SIZE];
            __shared__ int cta_ind_buffer[CTA_SIZE];

            reduceKeyVal<CTA_SIZE>(cta_dist_buffer, bestDist, cta_ind_buffer, bestInd, threadIdx.x, less<int>());

            if (threadIdx.x == 0)
            {
                int y = bestInd / search_window;
                int x = bestInd - y * search_window;

                velx = x - search_radius;
                vely = y - search_radius;
            }
        }

        __device__ __forceinline__ void operator()(PtrStepf velx, PtrStepf vely) const
        {
            int tbx = blockIdx.x * TILE_COLS;
            int tby = blockIdx.y * TILE_ROWS;

            int tex = ::min(tbx + TILE_COLS, I0.cols);
            int tey = ::min(tby + TILE_ROWS, I0.rows);

            PtrStepi col_sums;
            col_sums.data = buffer.ptr(I0.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window;
            col_sums.step = buffer.step;

            PtrStepi up_col_sums;
            up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window;
            up_col_sums.step = buffer.step;

            extern __shared__ int dist_sums[]; // search_window * search_window

            int first = 0;

            for (int i = tby; i < tey; ++i)
            {
                for (int j = tbx; j < tex; ++j)
                {
                    __syncthreads();

                    if (j == tbx)
                    {
                        initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums);
                        first = 0;
                    }
                    else
                    {
                        if (i == tby)
                            shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
                        else
                            shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums);

                        first = (first + 1) % block_window;
                    }

                    __syncthreads();

                    convolve_window(i, j, dist_sums, velx(i, j), vely(i, j));
                }
            }
        }
    };

    template <typename T> __global__ void optflowbm_fast_kernel(const FastOptFlowBM<T> fbm, PtrStepf velx, PtrStepf vely)
    {
        fbm(velx, vely);
    }

    void get_buffer_size(int src_cols, int src_rows, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
    {
        dim3 grid(divUp(src_cols, TILE_COLS), divUp(src_rows, TILE_ROWS));

        buffer_cols = search_window * search_window * grid.y;
        buffer_rows = src_cols + block_window * grid.x;
    }
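
    // Illustrative host-side sequence (the buffer GpuMat is an assumption about the
    // caller, not something defined in this file): query the scratch size first, then
    // allocate a CV_32SC1 buffer of that shape and hand it to calc<>():
    //
    //     int bcols, brows;
    //     get_buffer_size(I0.cols, I0.rows, search_window, block_window, bcols, brows);
    //     cv::gpu::GpuMat buffer(brows, bcols, CV_32SC1);
    //     calc<uchar>(I0, I1, velx, vely, buffer, search_window, block_window, 0);
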
    template <typename T>
    void calc(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream)
    {
        FastOptFlowBM<T> fbm(search_window, block_window, I0, I1, buffer);

        dim3 block(CTA_SIZE, 1);
        dim3 grid(divUp(I0.cols, TILE_COLS), divUp(I0.rows, TILE_ROWS));

        size_t smem = search_window * search_window * sizeof(int);

        optflowbm_fast_kernel<<<grid, block, smem, stream>>>(fbm, velx, vely);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template void calc<uchar>(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream);
}

#endif // !defined CUDA_DISABLER
@@ -1,220 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace cudev
|
||||
{
|
||||
namespace optical_flow
|
||||
{
|
||||
#define NEEDLE_MAP_SCALE 16
|
||||
#define NUM_VERTS_PER_ARROW 6
|
||||
|
||||
__global__ void NeedleMapAverageKernel(const PtrStepSzf u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)
|
||||
{
|
||||
__shared__ float smem[2 * NEEDLE_MAP_SCALE];
|
||||
|
||||
volatile float* u_col_sum = smem;
|
||||
volatile float* v_col_sum = u_col_sum + NEEDLE_MAP_SCALE;
|
||||
|
||||
const int x = blockIdx.x * NEEDLE_MAP_SCALE + threadIdx.x;
|
||||
const int y = blockIdx.y * NEEDLE_MAP_SCALE;
|
||||
|
||||
u_col_sum[threadIdx.x] = 0;
|
||||
v_col_sum[threadIdx.x] = 0;
|
||||
|
||||
#pragma unroll
|
||||
for(int i = 0; i < NEEDLE_MAP_SCALE; ++i)
|
||||
{
|
||||
u_col_sum[threadIdx.x] += u(::min(y + i, u.rows - 1), x);
|
||||
v_col_sum[threadIdx.x] += v(::min(y + i, u.rows - 1), x);
|
||||
}
|
||||
|
||||
if (threadIdx.x < 8)
|
||||
{
|
||||
// now add the column sums
|
||||
const uint X = threadIdx.x;
|
||||
|
||||
if (X | 0xfe == 0xfe) // bit 0 is 0
|
||||
{
|
||||
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 1];
|
||||
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 1];
|
||||
}
|
||||
|
||||
if (X | 0xfe == 0xfc) // bits 0 & 1 == 0
|
||||
{
|
||||
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2];
|
||||
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2];
|
||||
}
|
||||
|
||||
if (X | 0xf8 == 0xf8)
|
||||
{
|
||||
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 4];
|
||||
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 4];
|
||||
}
|
||||
|
||||
if (X == 0)
|
||||
{
|
||||
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 8];
|
||||
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 8];
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
const float coeff = 1.0f / (NEEDLE_MAP_SCALE * NEEDLE_MAP_SCALE);
|
||||
|
||||
u_col_sum[0] *= coeff;
|
||||
v_col_sum[0] *= coeff;
|
||||
|
||||
u_avg(blockIdx.y, blockIdx.x) = u_col_sum[0];
|
||||
v_avg(blockIdx.y, blockIdx.x) = v_col_sum[0];
|
||||
}
|
||||
}
|
||||
|
||||
void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg)
|
||||
{
|
||||
const dim3 block(NEEDLE_MAP_SCALE);
|
||||
const dim3 grid(u_avg.cols, u_avg.rows);
|
||||
|
||||
NeedleMapAverageKernel<<<grid, block>>>(u, v, u_avg, v_avg);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
__global__ void NeedleMapVertexKernel(const PtrStepSzf u_avg, const PtrStepf v_avg, float* vertex_data, float* color_data, float max_flow, float xscale, float yscale)
|
||||
{
|
||||
// test - just draw a triangle at each pixel
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const float arrow_x = x * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
|
||||
const float arrow_y = y * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
|
||||
|
||||
float3 v[NUM_VERTS_PER_ARROW];
|
||||
|
||||
if (x < u_avg.cols && y < u_avg.rows)
|
||||
{
|
||||
const float u_avg_val = u_avg(y, x);
|
||||
const float v_avg_val = v_avg(y, x);
|
||||
|
||||
const float theta = ::atan2f(v_avg_val, u_avg_val);// + CV_PI;
|
||||
|
||||
float r = ::sqrtf(v_avg_val * v_avg_val + u_avg_val * u_avg_val);
|
||||
r = fmin(14.0f * (r / max_flow), 14.0f);
|
||||
|
||||
v[0].z = 1.0f;
|
||||
v[1].z = 0.7f;
|
||||
v[2].z = 0.7f;
|
||||
v[3].z = 0.7f;
|
||||
v[4].z = 0.7f;
|
||||
v[5].z = 1.0f;
|
||||
|
||||
v[0].x = arrow_x;
|
||||
v[0].y = arrow_y;
|
||||
v[5].x = arrow_x;
|
||||
v[5].y = arrow_y;
|
||||
|
||||
v[2].x = arrow_x + r * ::cosf(theta);
|
||||
v[2].y = arrow_y + r * ::sinf(theta);
|
||||
v[3].x = v[2].x;
|
||||
v[3].y = v[2].y;
|
||||
|
||||
r = ::fmin(r, 2.5f);
|
||||
|
||||
v[1].x = arrow_x + r * ::cosf(theta - CV_PI_F / 2.0f);
|
||||
v[1].y = arrow_y + r * ::sinf(theta - CV_PI_F / 2.0f);
|
||||
|
||||
v[4].x = arrow_x + r * ::cosf(theta + CV_PI_F / 2.0f);
|
||||
v[4].y = arrow_y + r * ::sinf(theta + CV_PI_F / 2.0f);
|
||||
|
||||
int indx = (y * u_avg.cols + x) * NUM_VERTS_PER_ARROW * 3;
|
||||
|
||||
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
|
||||
vertex_data[indx++] = v[0].x * xscale;
|
||||
vertex_data[indx++] = v[0].y * yscale;
|
||||
vertex_data[indx++] = v[0].z;
|
||||
|
||||
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
|
||||
vertex_data[indx++] = v[1].x * xscale;
|
||||
vertex_data[indx++] = v[1].y * yscale;
|
||||
vertex_data[indx++] = v[1].z;
|
||||
|
||||
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
|
||||
vertex_data[indx++] = v[2].x * xscale;
|
||||
vertex_data[indx++] = v[2].y * yscale;
|
||||
vertex_data[indx++] = v[2].z;
|
||||
|
||||
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
|
||||
vertex_data[indx++] = v[3].x * xscale;
|
||||
vertex_data[indx++] = v[3].y * yscale;
|
||||
vertex_data[indx++] = v[3].z;
|
||||
|
||||
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
|
||||
vertex_data[indx++] = v[4].x * xscale;
|
||||
vertex_data[indx++] = v[4].y * yscale;
|
||||
vertex_data[indx++] = v[4].z;
|
||||
|
||||
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
|
||||
vertex_data[indx++] = v[5].x * xscale;
|
||||
vertex_data[indx++] = v[5].y * yscale;
|
||||
vertex_data[indx++] = v[5].z;
|
||||
}
|
||||
}
|
||||
|
||||
void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale)
|
||||
{
|
||||
const dim3 block(16);
|
||||
const dim3 grid(divUp(u_avg.cols, block.x), divUp(u_avg.rows, block.y));
|
||||
|
||||
NeedleMapVertexKernel<<<grid, block>>>(u_avg, v_avg, vertex_buffer, color_data, max_flow, xscale, yscale);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
||||
@@ -1,647 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
#include "opencv2/core/cuda/border_interpolate.hpp"
|
||||
|
||||
#define tx threadIdx.x
|
||||
#define ty threadIdx.y
|
||||
#define bx blockIdx.x
|
||||
#define by blockIdx.y
|
||||
#define bdx blockDim.x
|
||||
#define bdy blockDim.y
|
||||
|
||||
#define BORDER_SIZE 5
|
||||
#define MAX_KSIZE_HALF 100
|
||||
|
||||
namespace cv { namespace gpu { namespace cudev { namespace optflow_farneback
|
||||
{
|
||||
__constant__ float c_g[8];
|
||||
__constant__ float c_xg[8];
|
||||
__constant__ float c_xxg[8];
|
||||
__constant__ float c_ig11, c_ig03, c_ig33, c_ig55;
|
||||
|
||||
|
||||
template <int polyN>
|
||||
__global__ void polynomialExpansion(
|
||||
const int height, const int width, const PtrStepf src, PtrStepf dst)
|
||||
{
|
||||
const int y = by * bdy + ty;
|
||||
const int x = bx * (bdx - 2*polyN) + tx - polyN;
|
||||
|
||||
if (y < height)
|
||||
{
|
||||
extern __shared__ float smem[];
|
||||
volatile float *row = smem + tx;
|
||||
int xWarped = ::min(::max(x, 0), width - 1);
|
||||
|
||||
row[0] = src(y, xWarped) * c_g[0];
|
||||
row[bdx] = 0.f;
|
||||
row[2*bdx] = 0.f;
|
||||
|
||||
for (int k = 1; k <= polyN; ++k)
|
||||
{
|
||||
float t0 = src(::max(y - k, 0), xWarped);
|
||||
float t1 = src(::min(y + k, height - 1), xWarped);
|
||||
|
||||
row[0] += c_g[k] * (t0 + t1);
|
||||
row[bdx] += c_xg[k] * (t1 - t0);
|
||||
row[2*bdx] += c_xxg[k] * (t0 + t1);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (tx >= polyN && tx + polyN < bdx && x < width)
|
||||
{
|
||||
float b1 = c_g[0] * row[0];
|
||||
float b3 = c_g[0] * row[bdx];
|
||||
float b5 = c_g[0] * row[2*bdx];
|
||||
float b2 = 0, b4 = 0, b6 = 0;
|
||||
|
||||
for (int k = 1; k <= polyN; ++k)
|
||||
{
|
||||
b1 += (row[k] + row[-k]) * c_g[k];
|
||||
b4 += (row[k] + row[-k]) * c_xxg[k];
|
||||
b2 += (row[k] - row[-k]) * c_xg[k];
|
||||
b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];
|
||||
b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];
|
||||
b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
|
||||
}
|
||||
|
||||
dst(y, xWarped) = b3*c_ig11;
|
||||
dst(height + y, xWarped) = b2*c_ig11;
|
||||
dst(2*height + y, xWarped) = b1*c_ig03 + b5*c_ig33;
|
||||
dst(3*height + y, xWarped) = b1*c_ig03 + b4*c_ig33;
|
||||
dst(4*height + y, xWarped) = b6*c_ig55;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void setPolynomialExpansionConsts(
|
||||
int polyN, const float *g, const float *xg, const float *xxg,
|
||||
float ig11, float ig03, float ig33, float ig55)
|
||||
{
|
||||
cudaSafeCall(cudaMemcpyToSymbol(c_g, g, (polyN + 1) * sizeof(*g)));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(c_xg, xg, (polyN + 1) * sizeof(*xg)));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(c_xxg, xxg, (polyN + 1) * sizeof(*xxg)));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(c_ig11, &ig11, sizeof(ig11)));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(c_ig03, &ig03, sizeof(ig03)));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(c_ig33, &ig33, sizeof(ig33)));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(c_ig55, &ig55, sizeof(ig55)));
|
||||
}
|
||||
|
||||
|
||||
void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(256);
|
||||
dim3 grid(divUp(src.cols, block.x - 2*polyN), src.rows);
|
||||
int smem = 3 * block.x * sizeof(float);
|
||||
|
||||
if (polyN == 5)
|
||||
polynomialExpansion<5><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
|
||||
else if (polyN == 7)
|
||||
polynomialExpansion<7><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
|
||||
|
||||
cudaSafeCall(cudaGetLastError());
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
__constant__ float c_border[BORDER_SIZE + 1];
|
||||
|
||||
__global__ void updateMatrices(
|
||||
const int height, const int width, const PtrStepf flowx, const PtrStepf flowy,
|
||||
const PtrStepf R0, const PtrStepf R1, PtrStepf M)
|
||||
{
|
||||
const int y = by * bdy + ty;
|
||||
const int x = bx * bdx + tx;
|
||||
|
||||
if (y < height && x < width)
|
||||
{
|
||||
float dx = flowx(y, x);
|
||||
float dy = flowy(y, x);
|
||||
float fx = x + dx;
|
||||
float fy = y + dy;
|
||||
|
||||
int x1 = floorf(fx);
|
||||
int y1 = floorf(fy);
|
||||
fx -= x1; fy -= y1;
|
||||
|
||||
float r2, r3, r4, r5, r6;
|
||||
|
||||
if (x1 >= 0 && y1 >= 0 && x1 < width - 1 && y1 < height - 1)
|
||||
{
|
||||
float a00 = (1.f - fx) * (1.f - fy);
|
||||
float a01 = fx * (1.f - fy);
|
||||
float a10 = (1.f - fx) * fy;
|
||||
float a11 = fx * fy;
|
||||
|
||||
r2 = a00 * R1(y1, x1) +
|
||||
a01 * R1(y1, x1 + 1) +
|
||||
a10 * R1(y1 + 1, x1) +
|
||||
a11 * R1(y1 + 1, x1 + 1);
|
||||
|
||||
r3 = a00 * R1(height + y1, x1) +
|
||||
a01 * R1(height + y1, x1 + 1) +
|
||||
a10 * R1(height + y1 + 1, x1) +
|
||||
a11 * R1(height + y1 + 1, x1 + 1);
|
||||
|
||||
r4 = a00 * R1(2*height + y1, x1) +
|
||||
a01 * R1(2*height + y1, x1 + 1) +
|
||||
a10 * R1(2*height + y1 + 1, x1) +
|
||||
a11 * R1(2*height + y1 + 1, x1 + 1);
|
||||
|
||||
r5 = a00 * R1(3*height + y1, x1) +
|
||||
a01 * R1(3*height + y1, x1 + 1) +
|
||||
a10 * R1(3*height + y1 + 1, x1) +
|
||||
a11 * R1(3*height + y1 + 1, x1 + 1);
|
||||
|
||||
r6 = a00 * R1(4*height + y1, x1) +
|
||||
a01 * R1(4*height + y1, x1 + 1) +
|
||||
a10 * R1(4*height + y1 + 1, x1) +
|
||||
a11 * R1(4*height + y1 + 1, x1 + 1);
|
||||
|
||||
r4 = (R0(2*height + y, x) + r4) * 0.5f;
|
||||
r5 = (R0(3*height + y, x) + r5) * 0.5f;
|
||||
r6 = (R0(4*height + y, x) + r6) * 0.25f;
|
||||
}
|
||||
else
|
||||
{
|
||||
r2 = r3 = 0.f;
|
||||
r4 = R0(2*height + y, x);
|
||||
r5 = R0(3*height + y, x);
|
||||
r6 = R0(4*height + y, x) * 0.5f;
|
||||
}
|
||||
|
||||
r2 = (R0(y, x) - r2) * 0.5f;
|
||||
r3 = (R0(height + y, x) - r3) * 0.5f;
|
||||
|
||||
r2 += r4*dy + r6*dx;
|
||||
r3 += r6*dy + r5*dx;
|
||||
|
||||
float scale =
|
||||
c_border[::min(x, BORDER_SIZE)] *
|
||||
c_border[::min(y, BORDER_SIZE)] *
|
||||
c_border[::min(width - x - 1, BORDER_SIZE)] *
|
||||
c_border[::min(height - y - 1, BORDER_SIZE)];
|
||||
|
||||
r2 *= scale; r3 *= scale; r4 *= scale;
|
||||
r5 *= scale; r6 *= scale;
|
||||
|
||||
M(y, x) = r4*r4 + r6*r6;
|
||||
M(height + y, x) = (r4 + r5)*r6;
|
||||
M(2*height + y, x) = r5*r5 + r6*r6;
|
||||
M(3*height + y, x) = r4*r2 + r6*r3;
|
||||
M(4*height + y, x) = r6*r2 + r5*r3;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void setUpdateMatricesConsts()
|
||||
{
|
||||
static const float border[BORDER_SIZE + 1] = {0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f};
|
||||
cudaSafeCall(cudaMemcpyToSymbol(c_border, border, (BORDER_SIZE + 1) * sizeof(*border)));
|
||||
}
|
||||
|
||||
|
||||
void updateMatricesGpu(
|
||||
const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
|
||||
PtrStepSzf M, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
|
||||
|
||||
updateMatrices<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, flowx, flowy, R0, R1, M);
|
||||
|
||||
cudaSafeCall(cudaGetLastError());
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
__global__ void updateFlow(
|
||||
const int height, const int width, const PtrStepf M, PtrStepf flowx, PtrStepf flowy)
|
||||
{
|
||||
const int y = by * bdy + ty;
|
||||
const int x = bx * bdx + tx;
|
||||
|
||||
if (y < height && x < width)
|
||||
{
|
||||
float g11 = M(y, x);
|
||||
float g12 = M(height + y, x);
|
||||
float g22 = M(2*height + y, x);
|
||||
float h1 = M(3*height + y, x);
|
||||
float h2 = M(4*height + y, x);
|
||||
|
||||
float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);
|
||||
|
||||
flowx(y, x) = (g11*h2 - g12*h1) * detInv;
|
||||
flowy(y, x) = (g22*h1 - g12*h2) * detInv;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void updateFlowGpu(const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
|
||||
|
||||
updateFlow<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, M, flowx, flowy);
|
||||
|
||||
cudaSafeCall(cudaGetLastError());
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
/*__global__ void boxFilter(
|
||||
const int height, const int width, const PtrStepf src,
|
||||
const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
|
||||
{
|
||||
const int y = by * bdy + ty;
|
||||
const int x = bx * bdx + tx;
|
||||
|
||||
extern __shared__ float smem[];
|
||||
volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
|
||||
|
||||
if (y < height)
|
||||
{
|
||||
// Vertical pass
|
||||
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
|
||||
{
|
||||
int xExt = int(bx * bdx) + i - ksizeHalf;
|
||||
xExt = ::min(::max(xExt, 0), width - 1);
|
||||
|
||||
row[i] = src(y, xExt);
|
||||
for (int j = 1; j <= ksizeHalf; ++j)
|
||||
row[i] += src(::max(y - j, 0), xExt) + src(::min(y + j, height - 1), xExt);
|
||||
}
|
||||
|
||||
if (x < width)
|
||||
{
|
||||
__syncthreads();
|
||||
|
||||
// Horizontal passs
|
||||
row += tx + ksizeHalf;
|
||||
float res = row[0];
|
||||
for (int i = 1; i <= ksizeHalf; ++i)
|
||||
res += row[-i] + row[i];
|
||||
dst(y, x) = res * boxAreaInv;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(256);
|
||||
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
|
||||
int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
|
||||
|
||||
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
|
||||
boxFilter<<<grid, block, smem, stream>>>(src.rows, src.cols, src, ksizeHalf, boxAreaInv, dst);
|
||||
|
||||
cudaSafeCall(cudaGetLastError());
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}*/
|
||||
|
||||
|
||||
__global__ void boxFilter5(
|
||||
const int height, const int width, const PtrStepf src,
|
||||
const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
|
||||
{
|
||||
const int y = by * bdy + ty;
|
||||
const int x = bx * bdx + tx;
|
||||
|
||||
extern __shared__ float smem[];
|
||||
|
||||
const int smw = bdx + 2*ksizeHalf; // shared memory "width"
|
||||
volatile float *row = smem + 5 * ty * smw;
|
||||
|
||||
if (y < height)
|
||||
{
|
||||
// Vertical pass
|
||||
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
|
||||
{
|
||||
int xExt = int(bx * bdx) + i - ksizeHalf;
|
||||
xExt = ::min(::max(xExt, 0), width - 1);
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 5; ++k)
|
||||
row[k*smw + i] = src(k*height + y, xExt);
|
||||
|
||||
for (int j = 1; j <= ksizeHalf; ++j)
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 5; ++k)
|
||||
row[k*smw + i] +=
|
||||
src(k*height + ::max(y - j, 0), xExt) +
|
||||
src(k*height + ::min(y + j, height - 1), xExt);
|
||||
}
|
||||
|
||||
if (x < width)
|
||||
{
|
||||
__syncthreads();
|
||||
|
||||
// Horizontal passs
|
||||
|
||||
row += tx + ksizeHalf;
|
||||
float res[5];
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 5; ++k)
|
||||
res[k] = row[k*smw];
|
||||
|
||||
for (int i = 1; i <= ksizeHalf; ++i)
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 5; ++k)
|
||||
res[k] += row[k*smw - i] + row[k*smw + i];
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 5; ++k)
|
||||
dst(k*height + y, x) = res[k] * boxAreaInv;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
|
||||
{
|
||||
int height = src.rows / 5;
|
||||
int width = src.cols;
|
||||
|
||||
dim3 block(256);
|
||||
dim3 grid(divUp(width, block.x), divUp(height, block.y));
|
||||
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
|
||||
|
||||
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
|
||||
boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
|
||||
|
||||
cudaSafeCall(cudaGetLastError());
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
|
||||
{
|
||||
int height = src.rows / 5;
|
||||
int width = src.cols;
|
||||
|
||||
dim3 block(128);
|
||||
dim3 grid(divUp(width, block.x), divUp(height, block.y));
|
||||
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
|
||||
|
||||
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
|
||||
boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
|
||||
|
||||
cudaSafeCall(cudaGetLastError());
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
__constant__ float c_gKer[MAX_KSIZE_HALF + 1];
|
||||
|
||||
template <typename Border>
|
||||
__global__ void gaussianBlur(
|
||||
const int height, const int width, const PtrStepf src, const int ksizeHalf,
|
||||
const Border b, PtrStepf dst)
|
||||
{
|
||||
const int y = by * bdy + ty;
|
||||
const int x = bx * bdx + tx;
|
||||
|
||||
extern __shared__ float smem[];
|
||||
volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
|
||||
|
||||
if (y < height)
|
||||
{
|
||||
// Vertical pass
|
||||
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
|
||||
{
|
||||
int xExt = int(bx * bdx) + i - ksizeHalf;
|
||||
xExt = b.idx_col(xExt);
|
||||
row[i] = src(y, xExt) * c_gKer[0];
|
||||
for (int j = 1; j <= ksizeHalf; ++j)
|
||||
row[i] +=
|
||||
(src(b.idx_row_low(y - j), xExt) +
|
||||
src(b.idx_row_high(y + j), xExt)) * c_gKer[j];
|
||||
}
|
||||
|
||||
if (x < width)
|
||||
{
|
||||
__syncthreads();
|
||||
|
||||
// Horizontal pass
|
||||
row += tx + ksizeHalf;
|
||||
float res = row[0] * c_gKer[0];
|
||||
for (int i = 1; i <= ksizeHalf; ++i)
|
||||
res += (row[-i] + row[i]) * c_gKer[i];
|
||||
dst(y, x) = res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void setGaussianBlurKernel(const float *gKer, int ksizeHalf)
|
||||
{
|
||||
cudaSafeCall(cudaMemcpyToSymbol(c_gKer, gKer, (ksizeHalf + 1) * sizeof(*gKer)));
|
||||
}
|
||||
|
||||
|
||||
template <typename Border>
|
||||
void gaussianBlurCaller(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
|
||||
{
|
||||
int height = src.rows;
|
||||
int width = src.cols;
|
||||
|
||||
dim3 block(256);
|
||||
dim3 grid(divUp(width, block.x), divUp(height, block.y));
|
||||
int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
|
||||
Border b(height, width);
|
||||
|
||||
gaussianBlur<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
|
||||
|
||||
cudaSafeCall(cudaGetLastError());
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
void gaussianBlurGpu(
|
||||
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
|
||||
|
||||
static const caller_t callers[] =
|
||||
{
|
||||
gaussianBlurCaller<BrdReflect101<float> >,
|
||||
gaussianBlurCaller<BrdReplicate<float> >,
|
||||
};
|
||||
|
||||
callers[borderMode](src, ksizeHalf, dst, stream);
|
||||
}
|
||||
|
||||
|
||||
template <typename Border>
|
||||
__global__ void gaussianBlur5(
|
||||
const int height, const int width, const PtrStepf src, const int ksizeHalf,
|
||||
const Border b, PtrStepf dst)
|
||||
{
|
||||
const int y = by * bdy + ty;
|
||||
const int x = bx * bdx + tx;
|
||||
|
||||
extern __shared__ float smem[];
|
||||
|
||||
const int smw = bdx + 2*ksizeHalf; // shared memory "width"
|
||||
volatile float *row = smem + 5 * ty * smw;
|
||||
|
||||
if (y < height)
|
||||
{
|
||||
// Vertical pass
|
||||
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
|
||||
{
|
||||
int xExt = int(bx * bdx) + i - ksizeHalf;
|
||||
xExt = b.idx_col(xExt);
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 5; ++k)
|
||||
row[k*smw + i] = src(k*height + y, xExt) * c_gKer[0];
|
||||
|
||||
for (int j = 1; j <= ksizeHalf; ++j)
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 5; ++k)
|
||||
row[k*smw + i] +=
|
||||
(src(k*height + b.idx_row_low(y - j), xExt) +
|
||||
src(k*height + b.idx_row_high(y + j), xExt)) * c_gKer[j];
|
||||
}
|
||||
|
||||
if (x < width)
|
||||
{
|
||||
__syncthreads();
|
||||
|
||||
// Horizontal pass
|
||||
|
||||
row += tx + ksizeHalf;
|
||||
float res[5];
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 5; ++k)
|
||||
res[k] = row[k*smw] * c_gKer[0];
|
||||
|
||||
for (int i = 1; i <= ksizeHalf; ++i)
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 5; ++k)
|
||||
res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 5; ++k)
|
||||
dst(k*height + y, x) = res[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename Border, int blockDimX>
|
||||
void gaussianBlur5Caller(
|
||||
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
|
||||
{
|
||||
int height = src.rows / 5;
|
||||
int width = src.cols;
|
||||
|
||||
dim3 block(blockDimX);
|
||||
dim3 grid(divUp(width, block.x), divUp(height, block.y));
|
||||
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
|
||||
Border b(height, width);
|
||||
|
||||
gaussianBlur5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
|
||||
|
||||
cudaSafeCall(cudaGetLastError());
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
void gaussianBlur5Gpu(
|
||||
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
|
||||
|
||||
static const caller_t callers[] =
|
||||
{
|
||||
gaussianBlur5Caller<BrdReflect101<float>,256>,
|
||||
gaussianBlur5Caller<BrdReplicate<float>,256>,
|
||||
};
|
||||
|
||||
callers[borderMode](src, ksizeHalf, dst, stream);
|
||||
}
|
||||
|
||||
void gaussianBlur5Gpu_CC11(
|
||||
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
|
||||
|
||||
static const caller_t callers[] =
|
||||
{
|
||||
gaussianBlur5Caller<BrdReflect101<float>,128>,
|
||||
gaussianBlur5Caller<BrdReplicate<float>,128>,
|
||||
};
|
||||
|
||||
callers[borderMode](src, ksizeHalf, dst, stream);
|
||||
}
|
||||
|
||||
}}}} // namespace cv { namespace gpu { namespace cudev { namespace optflow_farneback
|
||||
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
||||
@@ -1,560 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
#include "opencv2/core/cuda/utility.hpp"
|
||||
#include "opencv2/core/cuda/functional.hpp"
|
||||
#include "opencv2/core/cuda/limits.hpp"
|
||||
#include "opencv2/core/cuda/vec_math.hpp"
|
||||
#include "opencv2/core/cuda/reduce.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::cudev;
|
||||
|
||||
namespace pyrlk
|
||||
{
|
||||
__constant__ int c_winSize_x;
|
||||
__constant__ int c_winSize_y;
|
||||
__constant__ int c_halfWin_x;
|
||||
__constant__ int c_halfWin_y;
|
||||
__constant__ int c_iters;
|
||||
|
||||
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
|
||||
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
|
||||
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
|
||||
|
||||
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
|
||||
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
|
||||
|
||||
template <int cn> struct Tex_I;
|
||||
template <> struct Tex_I<1>
|
||||
{
|
||||
static __device__ __forceinline__ float read(float x, float y)
|
||||
{
|
||||
return tex2D(tex_If, x, y);
|
||||
}
|
||||
};
|
||||
template <> struct Tex_I<4>
|
||||
{
|
||||
static __device__ __forceinline__ float4 read(float x, float y)
|
||||
{
|
||||
return tex2D(tex_If4, x, y);
|
||||
}
|
||||
};
|
||||
|
||||
template <int cn> struct Tex_J;
|
||||
template <> struct Tex_J<1>
|
||||
{
|
||||
static __device__ __forceinline__ float read(float x, float y)
|
||||
{
|
||||
return tex2D(tex_Jf, x, y);
|
||||
}
|
||||
};
|
||||
template <> struct Tex_J<4>
|
||||
{
|
||||
static __device__ __forceinline__ float4 read(float x, float y)
|
||||
{
|
||||
return tex2D(tex_Jf4, x, y);
|
||||
}
|
||||
};
|
||||
|
||||
__device__ __forceinline__ void accum(float& dst, float val)
|
||||
{
|
||||
dst += val;
|
||||
}
|
||||
__device__ __forceinline__ void accum(float& dst, const float4& val)
|
||||
{
|
||||
dst += val.x + val.y + val.z;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ float abs_(float a)
|
||||
{
|
||||
return ::fabsf(a);
|
||||
}
|
||||
__device__ __forceinline__ float4 abs_(const float4& a)
|
||||
{
|
||||
return abs(a);
|
||||
}
|
||||
|
||||
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
|
||||
__global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
|
||||
{
|
||||
#if __CUDA_ARCH__ <= 110
|
||||
const int BLOCK_SIZE = 128;
|
||||
#else
|
||||
const int BLOCK_SIZE = 256;
|
||||
#endif
|
||||
|
||||
__shared__ float smem1[BLOCK_SIZE];
|
||||
__shared__ float smem2[BLOCK_SIZE];
|
||||
__shared__ float smem3[BLOCK_SIZE];
|
||||
|
||||
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
|
||||
|
||||
float2 prevPt = prevPts[blockIdx.x];
|
||||
prevPt.x *= (1.0f / (1 << level));
|
||||
prevPt.y *= (1.0f / (1 << level));
|
||||
|
||||
if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
|
||||
{
|
||||
if (tid == 0 && level == 0)
|
||||
status[blockIdx.x] = 0;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
prevPt.x -= c_halfWin_x;
|
||||
prevPt.y -= c_halfWin_y;
|
||||
|
||||
// extract the patch from the first image, compute covariation matrix of derivatives
|
||||
|
||||
float A11 = 0;
|
||||
float A12 = 0;
|
||||
float A22 = 0;
|
||||
|
||||
typedef typename TypeVec<float, cn>::vec_type work_type;
|
||||
|
||||
work_type I_patch [PATCH_Y][PATCH_X];
|
||||
work_type dIdx_patch[PATCH_Y][PATCH_X];
|
||||
work_type dIdy_patch[PATCH_Y][PATCH_X];
|
||||
|
||||
for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i)
|
||||
{
|
||||
for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j)
|
||||
{
|
||||
float x = prevPt.x + xBase + 0.5f;
|
||||
float y = prevPt.y + yBase + 0.5f;
|
||||
|
||||
I_patch[i][j] = Tex_I<cn>::read(x, y);
|
||||
|
||||
// Sharr Deriv
|
||||
|
||||
work_type dIdx = 3.0f * Tex_I<cn>::read(x+1, y-1) + 10.0f * Tex_I<cn>::read(x+1, y) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
|
||||
(3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x-1, y) + 3.0f * Tex_I<cn>::read(x-1, y+1));
|
||||
|
||||
work_type dIdy = 3.0f * Tex_I<cn>::read(x-1, y+1) + 10.0f * Tex_I<cn>::read(x, y+1) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
|
||||
(3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x, y-1) + 3.0f * Tex_I<cn>::read(x+1, y-1));
|
||||
|
||||
dIdx_patch[i][j] = dIdx;
|
||||
dIdy_patch[i][j] = dIdy;
|
||||
|
||||
accum(A11, dIdx * dIdx);
|
||||
accum(A12, dIdx * dIdy);
|
||||
accum(A22, dIdy * dIdy);
|
||||
}
|
||||
}
|
||||
|
||||
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
if (tid == 0)
|
||||
{
|
||||
smem1[0] = A11;
|
||||
smem2[0] = A12;
|
||||
smem3[0] = A22;
|
||||
}
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
A11 = smem1[0];
|
||||
A12 = smem2[0];
|
||||
A22 = smem3[0];
|
||||
|
||||
float D = A11 * A22 - A12 * A12;
|
||||
|
||||
if (D < numeric_limits<float>::epsilon())
|
||||
{
|
||||
if (tid == 0 && level == 0)
|
||||
status[blockIdx.x] = 0;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
D = 1.f / D;
|
||||
|
||||
A11 *= D;
|
||||
A12 *= D;
|
||||
A22 *= D;
|
||||
|
||||
float2 nextPt = nextPts[blockIdx.x];
|
||||
nextPt.x *= 2.f;
|
||||
nextPt.y *= 2.f;
|
||||
|
||||
nextPt.x -= c_halfWin_x;
|
||||
nextPt.y -= c_halfWin_y;
|
||||
|
||||
for (int k = 0; k < c_iters; ++k)
|
||||
{
|
||||
if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
|
||||
{
|
||||
if (tid == 0 && level == 0)
|
||||
status[blockIdx.x] = 0;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
float b1 = 0;
|
||||
float b2 = 0;
|
||||
|
||||
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
|
||||
{
|
||||
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
|
||||
{
|
||||
work_type I_val = I_patch[i][j];
|
||||
work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
|
||||
|
||||
work_type diff = (J_val - I_val) * 32.0f;
|
||||
|
||||
accum(b1, diff * dIdx_patch[i][j]);
|
||||
accum(b2, diff * dIdy_patch[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
if (tid == 0)
|
||||
{
|
||||
smem1[0] = b1;
|
||||
smem2[0] = b2;
|
||||
}
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
b1 = smem1[0];
|
||||
b2 = smem2[0];
|
||||
|
||||
float2 delta;
|
||||
delta.x = A12 * b2 - A22 * b1;
|
||||
delta.y = A12 * b1 - A11 * b2;
|
||||
|
||||
nextPt.x += delta.x;
|
||||
nextPt.y += delta.y;
|
||||
|
||||
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
|
||||
break;
|
||||
}
|
||||
|
||||
float errval = 0;
|
||||
if (calcErr)
|
||||
{
|
||||
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
|
||||
{
|
||||
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
|
||||
{
|
||||
work_type I_val = I_patch[i][j];
|
||||
work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
|
||||
|
||||
work_type diff = J_val - I_val;
|
||||
|
||||
accum(errval, abs_(diff));
|
||||
}
|
||||
}
|
||||
|
||||
reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
|
||||
}
|
||||
|
||||
if (tid == 0)
|
||||
{
|
||||
nextPt.x += c_halfWin_x;
|
||||
nextPt.y += c_halfWin_y;
|
||||
|
||||
nextPts[blockIdx.x] = nextPt;
|
||||
|
||||
if (calcErr)
|
||||
err[blockIdx.x] = static_cast<float>(errval) / (cn * c_winSize_x * c_winSize_y);
|
||||
}
|
||||
}
|
||||
|
||||
template <int cn, int PATCH_X, int PATCH_Y>
|
||||
void sparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
|
||||
int level, dim3 block, cudaStream_t stream)
|
||||
{
|
||||
dim3 grid(ptcount);
|
||||
|
||||
if (level == 0 && err)
|
||||
sparseKernel<cn, PATCH_X, PATCH_Y, true><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
|
||||
else
|
||||
sparseKernel<cn, PATCH_X, PATCH_Y, false><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <bool calcErr>
|
||||
__global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
|
||||
{
|
||||
extern __shared__ int smem[];
|
||||
|
||||
const int patchWidth = blockDim.x + 2 * c_halfWin_x;
|
||||
const int patchHeight = blockDim.y + 2 * c_halfWin_y;
|
||||
|
||||
int* I_patch = smem;
|
||||
int* dIdx_patch = I_patch + patchWidth * patchHeight;
|
||||
int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;
|
||||
|
||||
const int xBase = blockIdx.x * blockDim.x;
|
||||
const int yBase = blockIdx.y * blockDim.y;
|
||||
|
||||
for (int i = threadIdx.y; i < patchHeight; i += blockDim.y)
|
||||
{
|
||||
for (int j = threadIdx.x; j < patchWidth; j += blockDim.x)
|
||||
{
|
||||
float x = xBase - c_halfWin_x + j + 0.5f;
|
||||
float y = yBase - c_halfWin_y + i + 0.5f;
|
||||
|
||||
I_patch[i * patchWidth + j] = tex2D(tex_Ib, x, y);
|
||||
|
||||
// Sharr Deriv
|
||||
|
||||
dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x+1, y-1) + 10 * tex2D(tex_Ib, x+1, y) + 3 * tex2D(tex_Ib, x+1, y+1) -
|
||||
(3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x-1, y) + 3 * tex2D(tex_Ib, x-1, y+1));
|
||||
|
||||
dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x-1, y+1) + 10 * tex2D(tex_Ib, x, y+1) + 3 * tex2D(tex_Ib, x+1, y+1) -
|
||||
(3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x, y-1) + 3 * tex2D(tex_Ib, x+1, y-1));
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int x = xBase + threadIdx.x;
|
||||
const int y = yBase + threadIdx.y;
|
||||
|
||||
if (x >= cols || y >= rows)
|
||||
return;
|
||||
|
||||
int A11i = 0;
|
||||
int A12i = 0;
|
||||
int A22i = 0;
|
||||
|
||||
for (int i = 0; i < c_winSize_y; ++i)
|
||||
{
|
||||
for (int j = 0; j < c_winSize_x; ++j)
|
||||
{
|
||||
int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
|
||||
int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
|
||||
|
||||
A11i += dIdx * dIdx;
|
||||
A12i += dIdx * dIdy;
|
||||
A22i += dIdy * dIdy;
|
||||
}
|
||||
}
|
||||
|
||||
float A11 = A11i;
|
||||
float A12 = A12i;
|
||||
float A22 = A22i;
|
||||
|
||||
float D = A11 * A22 - A12 * A12;
|
||||
|
||||
if (D < numeric_limits<float>::epsilon())
|
||||
{
|
||||
if (calcErr)
|
||||
err(y, x) = numeric_limits<float>::max();
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
D = 1.f / D;
|
||||
|
||||
A11 *= D;
|
||||
A12 *= D;
|
||||
A22 *= D;
|
||||
|
||||
float2 nextPt;
|
||||
nextPt.x = x + prevU(y/2, x/2) * 2.0f;
|
||||
nextPt.y = y + prevV(y/2, x/2) * 2.0f;
|
||||
|
||||
for (int k = 0; k < c_iters; ++k)
|
||||
{
|
||||
if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
|
||||
{
|
||||
if (calcErr)
|
||||
err(y, x) = numeric_limits<float>::max();
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
int b1 = 0;
|
||||
int b2 = 0;
|
||||
|
||||
for (int i = 0; i < c_winSize_y; ++i)
|
||||
{
|
||||
for (int j = 0; j < c_winSize_x; ++j)
|
||||
{
|
||||
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
|
||||
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
|
||||
|
||||
int diff = (J - I) * 32;
|
||||
|
||||
int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
|
||||
int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
|
||||
|
||||
b1 += diff * dIdx;
|
||||
b2 += diff * dIdy;
|
||||
}
|
||||
}
|
||||
|
||||
float2 delta;
|
||||
delta.x = A12 * b2 - A22 * b1;
|
||||
delta.y = A12 * b1 - A11 * b2;
|
||||
|
||||
nextPt.x += delta.x;
|
||||
nextPt.y += delta.y;
|
||||
|
||||
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
|
||||
break;
|
||||
}
|
||||
|
||||
u(y, x) = nextPt.x - x;
|
||||
v(y, x) = nextPt.y - y;
|
||||
|
||||
if (calcErr)
|
||||
{
|
||||
int errval = 0;
|
||||
|
||||
for (int i = 0; i < c_winSize_y; ++i)
|
||||
{
|
||||
for (int j = 0; j < c_winSize_x; ++j)
|
||||
{
|
||||
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
|
||||
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
|
||||
|
||||
errval += ::abs(J - I);
|
||||
}
|
||||
}
|
||||
|
||||
err(y, x) = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);
|
||||
}
|
||||
}
|
||||
|
||||
void loadConstants(int2 winSize, int iters)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
|
||||
|
||||
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
|
||||
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
|
||||
}
|
||||
|
||||
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
|
||||
int level, dim3 block, dim3 patch, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
|
||||
int level, dim3 block, cudaStream_t stream);
|
||||
|
||||
static const func_t funcs[5][5] =
|
||||
{
|
||||
{sparse_caller<1, 1, 1>, sparse_caller<1, 2, 1>, sparse_caller<1, 3, 1>, sparse_caller<1, 4, 1>, sparse_caller<1, 5, 1>},
|
||||
{sparse_caller<1, 1, 2>, sparse_caller<1, 2, 2>, sparse_caller<1, 3, 2>, sparse_caller<1, 4, 2>, sparse_caller<1, 5, 2>},
|
||||
{sparse_caller<1, 1, 3>, sparse_caller<1, 2, 3>, sparse_caller<1, 3, 3>, sparse_caller<1, 4, 3>, sparse_caller<1, 5, 3>},
|
||||
{sparse_caller<1, 1, 4>, sparse_caller<1, 2, 4>, sparse_caller<1, 3, 4>, sparse_caller<1, 4, 4>, sparse_caller<1, 5, 4>},
|
||||
{sparse_caller<1, 1, 5>, sparse_caller<1, 2, 5>, sparse_caller<1, 3, 5>, sparse_caller<1, 4, 5>, sparse_caller<1, 5, 5>}
|
||||
};
|
||||
|
||||
bindTexture(&tex_If, I);
|
||||
bindTexture(&tex_Jf, J);
|
||||
|
||||
funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
|
||||
level, block, stream);
|
||||
}
|
||||
|
||||
void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
|
||||
int level, dim3 block, dim3 patch, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
|
||||
int level, dim3 block, cudaStream_t stream);
|
||||
|
||||
static const func_t funcs[5][5] =
|
||||
{
|
||||
{sparse_caller<4, 1, 1>, sparse_caller<4, 2, 1>, sparse_caller<4, 3, 1>, sparse_caller<4, 4, 1>, sparse_caller<4, 5, 1>},
|
||||
{sparse_caller<4, 1, 2>, sparse_caller<4, 2, 2>, sparse_caller<4, 3, 2>, sparse_caller<4, 4, 2>, sparse_caller<4, 5, 2>},
|
||||
{sparse_caller<4, 1, 3>, sparse_caller<4, 2, 3>, sparse_caller<4, 3, 3>, sparse_caller<4, 4, 3>, sparse_caller<4, 5, 3>},
|
||||
{sparse_caller<4, 1, 4>, sparse_caller<4, 2, 4>, sparse_caller<4, 3, 4>, sparse_caller<4, 4, 4>, sparse_caller<4, 5, 4>},
|
||||
{sparse_caller<4, 1, 5>, sparse_caller<4, 2, 5>, sparse_caller<4, 3, 5>, sparse_caller<4, 4, 5>, sparse_caller<4, 5, 5>}
|
||||
};
|
||||
|
||||
bindTexture(&tex_If4, I);
|
||||
bindTexture(&tex_Jf4, J);
|
||||
|
||||
funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
|
||||
level, block, stream);
|
||||
}
|
||||
|
||||
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(16, 16);
|
||||
dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
|
||||
|
||||
bindTexture(&tex_Ib, I);
|
||||
bindTexture(&tex_Jf, J);
|
||||
|
||||
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
|
||||
const int patchWidth = block.x + 2 * halfWin.x;
|
||||
const int patchHeight = block.y + 2 * halfWin.y;
|
||||
size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);
|
||||
|
||||
if (err.data)
|
||||
{
|
||||
denseKernel<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
else
|
||||
{
|
||||
denseKernel<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
||||
@@ -1,332 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
#include "opencv2/core/cuda/border_interpolate.hpp"
|
||||
#include "opencv2/core/cuda/limits.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::cudev;
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// centeredGradient
|
||||
|
||||
namespace tvl1flow
|
||||
{
|
||||
__global__ void centeredGradientKernel(const PtrStepSzf src, PtrStepf dx, PtrStepf dy)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x >= src.cols || y >= src.rows)
|
||||
return;
|
||||
|
||||
dx(y, x) = 0.5f * (src(y, ::min(x + 1, src.cols - 1)) - src(y, ::max(x - 1, 0)));
|
||||
dy(y, x) = 0.5f * (src(::min(y + 1, src.rows - 1), x) - src(::max(y - 1, 0), x));
|
||||
}
|
||||
|
||||
void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy)
|
||||
{
|
||||
const dim3 block(32, 8);
|
||||
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
|
||||
|
||||
centeredGradientKernel<<<grid, block>>>(src, dx, dy);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// warpBackward
|
||||
|
||||
namespace tvl1flow
|
||||
{
|
||||
static __device__ __forceinline__ float bicubicCoeff(float x_)
|
||||
{
|
||||
float x = fabsf(x_);
|
||||
if (x <= 1.0f)
|
||||
{
|
||||
return x * x * (1.5f * x - 2.5f) + 1.0f;
|
||||
}
|
||||
else if (x < 2.0f)
|
||||
{
|
||||
return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
|
||||
}
|
||||
else
|
||||
{
|
||||
return 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
|
||||
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
|
||||
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
|
||||
|
||||
__global__ void warpBackwardKernel(const PtrStepSzf I0, const PtrStepf u1, const PtrStepf u2, PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x >= I0.cols || y >= I0.rows)
|
||||
return;
|
||||
|
||||
const float u1Val = u1(y, x);
|
||||
const float u2Val = u2(y, x);
|
||||
|
||||
const float wx = x + u1Val;
|
||||
const float wy = y + u2Val;
|
||||
|
||||
const int xmin = ::ceilf(wx - 2.0f);
|
||||
const int xmax = ::floorf(wx + 2.0f);
|
||||
|
||||
const int ymin = ::ceilf(wy - 2.0f);
|
||||
const int ymax = ::floorf(wy + 2.0f);
|
||||
|
||||
float sum = 0.0f;
|
||||
float sumx = 0.0f;
|
||||
float sumy = 0.0f;
|
||||
float wsum = 0.0f;
|
||||
|
||||
for (int cy = ymin; cy <= ymax; ++cy)
|
||||
{
|
||||
for (int cx = xmin; cx <= xmax; ++cx)
|
||||
{
|
||||
const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
|
||||
|
||||
sum += w * tex2D(tex_I1 , cx, cy);
|
||||
sumx += w * tex2D(tex_I1x, cx, cy);
|
||||
sumy += w * tex2D(tex_I1y, cx, cy);
|
||||
|
||||
wsum += w;
|
||||
}
|
||||
}
|
||||
|
||||
const float coeff = 1.0f / wsum;
|
||||
|
||||
const float I1wVal = sum * coeff;
|
||||
const float I1wxVal = sumx * coeff;
|
||||
const float I1wyVal = sumy * coeff;
|
||||
|
||||
I1w(y, x) = I1wVal;
|
||||
I1wx(y, x) = I1wxVal;
|
||||
I1wy(y, x) = I1wyVal;
|
||||
|
||||
const float Ix2 = I1wxVal * I1wxVal;
|
||||
const float Iy2 = I1wyVal * I1wyVal;
|
||||
|
||||
// store the |Grad(I1)|^2
|
||||
grad(y, x) = Ix2 + Iy2;
|
||||
|
||||
// compute the constant part of the rho function
|
||||
const float I0Val = I0(y, x);
|
||||
rho(y, x) = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
|
||||
}
|
||||
|
||||
void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho)
|
||||
{
|
||||
const dim3 block(32, 8);
|
||||
const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
|
||||
|
||||
bindTexture(&tex_I1 , I1);
|
||||
bindTexture(&tex_I1x, I1x);
|
||||
bindTexture(&tex_I1y, I1y);
|
||||
|
||||
warpBackwardKernel<<<grid, block>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// estimateU
|
||||
|
||||
namespace tvl1flow
{
    __device__ float divergence(const PtrStepf& v1, const PtrStepf& v2, int y, int x)
    {
        if (x > 0 && y > 0)
        {
            const float v1x = v1(y, x) - v1(y, x - 1);
            const float v2y = v2(y, x) - v2(y - 1, x);
            return v1x + v2y;
        }
        else
        {
            if (y > 0)
                return v1(y, 0) + v2(y, 0) - v2(y - 1, 0);
            else
            {
                if (x > 0)
                    return v1(0, x) - v1(0, x - 1) + v2(0, x);
                else
                    return v1(0, 0) + v2(0, 0);
            }
        }
    }
|
||||
|
||||
__global__ void estimateUKernel(const PtrStepSzf I1wx, const PtrStepf I1wy,
|
||||
const PtrStepf grad, const PtrStepf rho_c,
|
||||
const PtrStepf p11, const PtrStepf p12, const PtrStepf p21, const PtrStepf p22,
|
||||
PtrStepf u1, PtrStepf u2, PtrStepf error,
|
||||
const float l_t, const float theta)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x >= I1wx.cols || y >= I1wx.rows)
|
||||
return;
|
||||
|
||||
const float I1wxVal = I1wx(y, x);
|
||||
const float I1wyVal = I1wy(y, x);
|
||||
const float gradVal = grad(y, x);
|
||||
const float u1OldVal = u1(y, x);
|
||||
const float u2OldVal = u2(y, x);
|
||||
|
||||
const float rho = rho_c(y, x) + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);
|
||||
|
||||
// estimate the values of the variable (v1, v2) (thresholding operator TH)
|
||||
|
||||
float d1 = 0.0f;
|
||||
float d2 = 0.0f;
|
||||
|
||||
if (rho < -l_t * gradVal)
|
||||
{
|
||||
d1 = l_t * I1wxVal;
|
||||
d2 = l_t * I1wyVal;
|
||||
}
|
||||
else if (rho > l_t * gradVal)
|
||||
{
|
||||
d1 = -l_t * I1wxVal;
|
||||
d2 = -l_t * I1wyVal;
|
||||
}
|
||||
else if (gradVal > numeric_limits<float>::epsilon())
|
||||
{
|
||||
const float fi = -rho / gradVal;
|
||||
d1 = fi * I1wxVal;
|
||||
d2 = fi * I1wyVal;
|
||||
}
|
||||
|
||||
const float v1 = u1OldVal + d1;
|
||||
const float v2 = u2OldVal + d2;
|
||||
|
||||
// compute the divergence of the dual variable (p1, p2)
|
||||
|
||||
const float div_p1 = divergence(p11, p12, y, x);
|
||||
const float div_p2 = divergence(p21, p22, y, x);
|
||||
|
||||
// estimate the values of the optical flow (u1, u2)
|
||||
|
||||
const float u1NewVal = v1 + theta * div_p1;
|
||||
const float u2NewVal = v2 + theta * div_p2;
|
||||
|
||||
u1(y, x) = u1NewVal;
|
||||
u2(y, x) = u2NewVal;
|
||||
|
||||
const float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
|
||||
const float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
|
||||
error(y, x) = n1 + n2;
|
||||
}
|
||||
|
||||
void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
|
||||
PtrStepSzf grad, PtrStepSzf rho_c,
|
||||
PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22,
|
||||
PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error,
|
||||
float l_t, float theta)
|
||||
{
|
||||
const dim3 block(32, 8);
|
||||
const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));
|
||||
|
||||
estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, error, l_t, theta);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
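// Math note: the three branches on rho above implement the pointwise thresholding step TH of the
// TV-L1 scheme (the attribution to the Zach/Pock/Bischof formulation is an assumption based on the
// kernel structure, not stated in this file):
//     v = u + l_t * grad(I1w)                      if rho(u) < -l_t * |grad(I1w)|^2
//     v = u - l_t * grad(I1w)                      if rho(u) >  l_t * |grad(I1w)|^2
//     v = u - rho(u) * grad(I1w) / |grad(I1w)|^2   otherwise,
// followed by the point-wise flow update u = v + theta * div(p); the per-pixel squared update
// |u_new - u_old|^2 is written to `error` for the caller.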
|
||||
|
||||
////////////////////////////////////////////////////////////
// estimateDualVariables

namespace tvl1flow
{
    __global__ void estimateDualVariablesKernel(const PtrStepSzf u1, const PtrStepf u2, PtrStepf p11, PtrStepf p12, PtrStepf p21, PtrStepf p22, const float taut)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x >= u1.cols || y >= u1.rows)
            return;

        const float u1x = u1(y, ::min(x + 1, u1.cols - 1)) - u1(y, x);
        const float u1y = u1(::min(y + 1, u1.rows - 1), x) - u1(y, x);

        const float u2x = u2(y, ::min(x + 1, u1.cols - 1)) - u2(y, x);
        const float u2y = u2(::min(y + 1, u1.rows - 1), x) - u2(y, x);

        const float g1 = ::hypotf(u1x, u1y);
        const float g2 = ::hypotf(u2x, u2y);

        const float ng1 = 1.0f + taut * g1;
        const float ng2 = 1.0f + taut * g2;

        p11(y, x) = (p11(y, x) + taut * u1x) / ng1;
        p12(y, x) = (p12(y, x) + taut * u1y) / ng1;
        p21(y, x) = (p21(y, x) + taut * u2x) / ng2;
        p22(y, x) = (p22(y, x) + taut * u2y) / ng2;
    }

    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, float taut)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));

        estimateDualVariablesKernel<<<grid, block>>>(u1, u2, p11, p12, p21, p22, taut);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
}
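// Math note: the update above is the standard projection step on the dual variable,
//     p^(n+1) = (p^n + taut * grad(u)) / (1 + taut * |grad(u)|),    with taut = tau / theta,
// applied independently to (p11, p12) with grad(u1) and to (p21, p22) with grad(u2). grad(u) is
// taken with forward differences and replication at the right/bottom border, matching the
// ::min(x + 1, cols - 1) indexing in the kernel.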
|
||||
|
||||
#endif // !defined CUDA_DISABLER
|
||||
@@ -1,753 +0,0 @@
|
||||
/* Standard OpenCV BSD license header omitted here; it is identical in every file of this commit. */
|
||||
|
||||
#include "precomp.hpp"
|
||||
|
||||
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
|
||||
|
||||
class cv::gpu::FGDStatModel::Impl
|
||||
{
|
||||
};
|
||||
|
||||
cv::gpu::FGDStatModel::Params::Params() { throw_no_cuda(); }
|
||||
|
||||
cv::gpu::FGDStatModel::FGDStatModel(int) { throw_no_cuda(); }
|
||||
cv::gpu::FGDStatModel::FGDStatModel(const cv::gpu::GpuMat&, const Params&, int) { throw_no_cuda(); }
|
||||
cv::gpu::FGDStatModel::~FGDStatModel() {}
|
||||
void cv::gpu::FGDStatModel::create(const cv::gpu::GpuMat&, const Params&) { throw_no_cuda(); }
|
||||
void cv::gpu::FGDStatModel::release() {}
|
||||
int cv::gpu::FGDStatModel::update(const cv::gpu::GpuMat&) { throw_no_cuda(); return 0; }
|
||||
|
||||
#else
|
||||
|
||||
#include "fgd_bgfg_common.hpp"
|
||||
#include "opencv2/imgproc/imgproc_c.h"
|
||||
|
||||
namespace
|
||||
{
|
||||
class BGPixelStat
|
||||
{
|
||||
public:
|
||||
void create(cv::Size size, const cv::gpu::FGDStatModel::Params& params, int out_cn);
|
||||
void release();
|
||||
|
||||
void setTrained();
|
||||
|
||||
operator bgfg::BGPixelStat();
|
||||
|
||||
private:
|
||||
cv::gpu::GpuMat Pbc_;
|
||||
cv::gpu::GpuMat Pbcc_;
|
||||
cv::gpu::GpuMat is_trained_st_model_;
|
||||
cv::gpu::GpuMat is_trained_dyn_model_;
|
||||
|
||||
cv::gpu::GpuMat ctable_Pv_;
|
||||
cv::gpu::GpuMat ctable_Pvb_;
|
||||
cv::gpu::GpuMat ctable_v_;
|
||||
|
||||
cv::gpu::GpuMat cctable_Pv_;
|
||||
cv::gpu::GpuMat cctable_Pvb_;
|
||||
cv::gpu::GpuMat cctable_v1_;
|
||||
cv::gpu::GpuMat cctable_v2_;
|
||||
};
|
||||
|
||||
void BGPixelStat::create(cv::Size size, const cv::gpu::FGDStatModel::Params& params, int out_cn)
|
||||
{
|
||||
cv::gpu::ensureSizeIsEnough(size, CV_32FC1, Pbc_);
|
||||
Pbc_.setTo(cv::Scalar::all(0));
|
||||
|
||||
cv::gpu::ensureSizeIsEnough(size, CV_32FC1, Pbcc_);
|
||||
Pbcc_.setTo(cv::Scalar::all(0));
|
||||
|
||||
cv::gpu::ensureSizeIsEnough(size, CV_8UC1, is_trained_st_model_);
|
||||
is_trained_st_model_.setTo(cv::Scalar::all(0));
|
||||
|
||||
cv::gpu::ensureSizeIsEnough(size, CV_8UC1, is_trained_dyn_model_);
|
||||
is_trained_dyn_model_.setTo(cv::Scalar::all(0));
|
||||
|
||||
cv::gpu::ensureSizeIsEnough(params.N2c * size.height, size.width, CV_32FC1, ctable_Pv_);
|
||||
ctable_Pv_.setTo(cv::Scalar::all(0));
|
||||
|
||||
cv::gpu::ensureSizeIsEnough(params.N2c * size.height, size.width, CV_32FC1, ctable_Pvb_);
|
||||
ctable_Pvb_.setTo(cv::Scalar::all(0));
|
||||
|
||||
cv::gpu::ensureSizeIsEnough(params.N2c * size.height, size.width, CV_8UC(out_cn), ctable_v_);
|
||||
ctable_v_.setTo(cv::Scalar::all(0));
|
||||
|
||||
cv::gpu::ensureSizeIsEnough(params.N2cc * size.height, size.width, CV_32FC1, cctable_Pv_);
|
||||
cctable_Pv_.setTo(cv::Scalar::all(0));
|
||||
|
||||
cv::gpu::ensureSizeIsEnough(params.N2cc * size.height, size.width, CV_32FC1, cctable_Pvb_);
|
||||
cctable_Pvb_.setTo(cv::Scalar::all(0));
|
||||
|
||||
cv::gpu::ensureSizeIsEnough(params.N2cc * size.height, size.width, CV_8UC(out_cn), cctable_v1_);
|
||||
cctable_v1_.setTo(cv::Scalar::all(0));
|
||||
|
||||
cv::gpu::ensureSizeIsEnough(params.N2cc * size.height, size.width, CV_8UC(out_cn), cctable_v2_);
|
||||
cctable_v2_.setTo(cv::Scalar::all(0));
|
||||
}
|
||||
|
||||
void BGPixelStat::release()
|
||||
{
|
||||
Pbc_.release();
|
||||
Pbcc_.release();
|
||||
is_trained_st_model_.release();
|
||||
is_trained_dyn_model_.release();
|
||||
|
||||
ctable_Pv_.release();
|
||||
ctable_Pvb_.release();
|
||||
ctable_v_.release();
|
||||
|
||||
cctable_Pv_.release();
|
||||
cctable_Pvb_.release();
|
||||
cctable_v1_.release();
|
||||
cctable_v2_.release();
|
||||
}
|
||||
|
||||
void BGPixelStat::setTrained()
|
||||
{
|
||||
is_trained_st_model_.setTo(cv::Scalar::all(1));
|
||||
is_trained_dyn_model_.setTo(cv::Scalar::all(1));
|
||||
}
|
||||
|
||||
BGPixelStat::operator bgfg::BGPixelStat()
|
||||
{
|
||||
bgfg::BGPixelStat stat;
|
||||
|
||||
stat.rows_ = Pbc_.rows;
|
||||
|
||||
stat.Pbc_data_ = Pbc_.data;
|
||||
stat.Pbc_step_ = Pbc_.step;
|
||||
|
||||
stat.Pbcc_data_ = Pbcc_.data;
|
||||
stat.Pbcc_step_ = Pbcc_.step;
|
||||
|
||||
stat.is_trained_st_model_data_ = is_trained_st_model_.data;
|
||||
stat.is_trained_st_model_step_ = is_trained_st_model_.step;
|
||||
|
||||
stat.is_trained_dyn_model_data_ = is_trained_dyn_model_.data;
|
||||
stat.is_trained_dyn_model_step_ = is_trained_dyn_model_.step;
|
||||
|
||||
stat.ctable_Pv_data_ = ctable_Pv_.data;
|
||||
stat.ctable_Pv_step_ = ctable_Pv_.step;
|
||||
|
||||
stat.ctable_Pvb_data_ = ctable_Pvb_.data;
|
||||
stat.ctable_Pvb_step_ = ctable_Pvb_.step;
|
||||
|
||||
stat.ctable_v_data_ = ctable_v_.data;
|
||||
stat.ctable_v_step_ = ctable_v_.step;
|
||||
|
||||
stat.cctable_Pv_data_ = cctable_Pv_.data;
|
||||
stat.cctable_Pv_step_ = cctable_Pv_.step;
|
||||
|
||||
stat.cctable_Pvb_data_ = cctable_Pvb_.data;
|
||||
stat.cctable_Pvb_step_ = cctable_Pvb_.step;
|
||||
|
||||
stat.cctable_v1_data_ = cctable_v1_.data;
|
||||
stat.cctable_v1_step_ = cctable_v1_.step;
|
||||
|
||||
stat.cctable_v2_data_ = cctable_v2_.data;
|
||||
stat.cctable_v2_step_ = cctable_v2_.step;
|
||||
|
||||
return stat;
|
||||
}
|
||||
}
|
||||
|
||||
class cv::gpu::FGDStatModel::Impl
|
||||
{
|
||||
public:
|
||||
Impl(cv::gpu::GpuMat& background, cv::gpu::GpuMat& foreground, std::vector< std::vector<cv::Point> >& foreground_regions, int out_cn);
|
||||
~Impl();
|
||||
|
||||
void create(const cv::gpu::GpuMat& firstFrame, const cv::gpu::FGDStatModel::Params& params);
|
||||
void release();
|
||||
|
||||
int update(const cv::gpu::GpuMat& curFrame);
|
||||
|
||||
private:
|
||||
Impl(const Impl&);
|
||||
Impl& operator=(const Impl&);
|
||||
|
||||
int out_cn_;
|
||||
|
||||
cv::gpu::FGDStatModel::Params params_;
|
||||
|
||||
cv::gpu::GpuMat& background_;
|
||||
cv::gpu::GpuMat& foreground_;
|
||||
std::vector< std::vector<cv::Point> >& foreground_regions_;
|
||||
|
||||
cv::Mat h_foreground_;
|
||||
|
||||
cv::gpu::GpuMat prevFrame_;
|
||||
cv::gpu::GpuMat Ftd_;
|
||||
cv::gpu::GpuMat Fbd_;
|
||||
BGPixelStat stat_;
|
||||
|
||||
cv::gpu::GpuMat hist_;
|
||||
cv::gpu::GpuMat histBuf_;
|
||||
|
||||
cv::gpu::GpuMat countBuf_;
|
||||
|
||||
cv::gpu::GpuMat buf_;
|
||||
cv::gpu::GpuMat filterBuf_;
|
||||
cv::gpu::GpuMat filterBrd_;
|
||||
|
||||
cv::Ptr<cv::gpu::FilterEngine_GPU> dilateFilter_;
|
||||
cv::Ptr<cv::gpu::FilterEngine_GPU> erodeFilter_;
|
||||
|
||||
CvMemStorage* storage_;
|
||||
};
|
||||
|
||||
cv::gpu::FGDStatModel::Impl::Impl(cv::gpu::GpuMat& background, cv::gpu::GpuMat& foreground, std::vector< std::vector<cv::Point> >& foreground_regions, int out_cn) :
|
||||
out_cn_(out_cn), background_(background), foreground_(foreground), foreground_regions_(foreground_regions)
|
||||
{
|
||||
CV_Assert( out_cn_ == 3 || out_cn_ == 4 );
|
||||
|
||||
storage_ = cvCreateMemStorage();
|
||||
CV_Assert( storage_ != 0 );
|
||||
}
|
||||
|
||||
cv::gpu::FGDStatModel::Impl::~Impl()
|
||||
{
|
||||
cvReleaseMemStorage(&storage_);
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
void copyChannels(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, int dst_cn = -1)
|
||||
{
|
||||
const int src_cn = src.channels();
|
||||
|
||||
if (dst_cn < 0)
|
||||
dst_cn = src_cn;
|
||||
|
||||
cv::gpu::ensureSizeIsEnough(src.size(), CV_MAKE_TYPE(src.depth(), dst_cn), dst);
|
||||
|
||||
if (src_cn == dst_cn)
|
||||
src.copyTo(dst);
|
||||
else
|
||||
{
|
||||
static const int cvt_codes[4][4] =
|
||||
{
|
||||
{-1, -1, cv::COLOR_GRAY2BGR, cv::COLOR_GRAY2BGRA},
|
||||
{-1, -1, -1, -1},
|
||||
{cv::COLOR_BGR2GRAY, -1, -1, cv::COLOR_BGR2BGRA},
|
||||
{cv::COLOR_BGRA2GRAY, -1, cv::COLOR_BGRA2BGR, -1}
|
||||
};
|
||||
|
||||
const int cvt_code = cvt_codes[src_cn - 1][dst_cn - 1];
|
||||
CV_DbgAssert( cvt_code >= 0 );
|
||||
|
||||
cv::gpu::cvtColor(src, dst, cvt_code, dst_cn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cv::gpu::FGDStatModel::Impl::create(const cv::gpu::GpuMat& firstFrame, const cv::gpu::FGDStatModel::Params& params)
|
||||
{
|
||||
CV_Assert(firstFrame.type() == CV_8UC3 || firstFrame.type() == CV_8UC4);
|
||||
|
||||
params_ = params;
|
||||
|
||||
cv::gpu::ensureSizeIsEnough(firstFrame.size(), CV_8UC1, foreground_);
|
||||
|
||||
copyChannels(firstFrame, background_, out_cn_);
|
||||
|
||||
copyChannels(firstFrame, prevFrame_);
|
||||
|
||||
cv::gpu::ensureSizeIsEnough(firstFrame.size(), CV_8UC1, Ftd_);
|
||||
cv::gpu::ensureSizeIsEnough(firstFrame.size(), CV_8UC1, Fbd_);
|
||||
|
||||
stat_.create(firstFrame.size(), params_, out_cn_);
|
||||
bgfg::setBGPixelStat(stat_);
|
||||
|
||||
if (params_.perform_morphing > 0)
|
||||
{
|
||||
cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(1 + params_.perform_morphing * 2, 1 + params_.perform_morphing * 2));
|
||||
cv::Point anchor(params_.perform_morphing, params_.perform_morphing);
|
||||
|
||||
dilateFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_DILATE, CV_8UC1, kernel, filterBuf_, anchor);
|
||||
erodeFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_ERODE, CV_8UC1, kernel, filterBuf_, anchor);
|
||||
}
|
||||
}
|
||||
|
||||
void cv::gpu::FGDStatModel::Impl::release()
|
||||
{
|
||||
background_.release();
|
||||
foreground_.release();
|
||||
|
||||
prevFrame_.release();
|
||||
Ftd_.release();
|
||||
Fbd_.release();
|
||||
stat_.release();
|
||||
|
||||
hist_.release();
|
||||
histBuf_.release();
|
||||
|
||||
countBuf_.release();
|
||||
|
||||
buf_.release();
|
||||
filterBuf_.release();
|
||||
filterBrd_.release();
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// changeDetection
|
||||
|
||||
namespace
|
||||
{
|
||||
void calcDiffHistogram(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::gpu::GpuMat& hist, cv::gpu::GpuMat& histBuf)
|
||||
{
|
||||
typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
|
||||
static const func_t funcs[4][4] =
|
||||
{
|
||||
{0,0,0,0},
|
||||
{0,0,0,0},
|
||||
{0,0,bgfg::calcDiffHistogram_gpu<uchar3, uchar3>,bgfg::calcDiffHistogram_gpu<uchar3, uchar4>},
|
||||
{0,0,bgfg::calcDiffHistogram_gpu<uchar4, uchar3>,bgfg::calcDiffHistogram_gpu<uchar4, uchar4>}
|
||||
};
|
||||
|
||||
hist.create(3, 256, CV_32SC1);
|
||||
histBuf.create(3, bgfg::PARTIAL_HISTOGRAM_COUNT * bgfg::HISTOGRAM_BIN_COUNT, CV_32SC1);
|
||||
|
||||
funcs[prevFrame.channels() - 1][curFrame.channels() - 1](
|
||||
prevFrame, curFrame,
|
||||
hist.ptr<unsigned int>(0), hist.ptr<unsigned int>(1), hist.ptr<unsigned int>(2),
|
||||
histBuf.ptr<unsigned int>(0), histBuf.ptr<unsigned int>(1), histBuf.ptr<unsigned int>(2),
|
||||
cv::gpu::deviceSupports(cv::gpu::FEATURE_SET_COMPUTE_20), 0);
|
||||
}
|
||||
|
||||
void calcRelativeVariance(unsigned int hist[3 * 256], double relativeVariance[3][bgfg::HISTOGRAM_BIN_COUNT])
|
||||
{
|
||||
std::memset(relativeVariance, 0, 3 * bgfg::HISTOGRAM_BIN_COUNT * sizeof(double));
|
||||
|
||||
for (int thres = bgfg::HISTOGRAM_BIN_COUNT - 2; thres >= 0; --thres)
|
||||
{
|
||||
cv::Vec3d sum(0.0, 0.0, 0.0);
|
||||
cv::Vec3d sqsum(0.0, 0.0, 0.0);
|
||||
cv::Vec3i count(0, 0, 0);
|
||||
|
||||
for (int j = thres; j < bgfg::HISTOGRAM_BIN_COUNT; ++j)
|
||||
{
|
||||
sum[0] += static_cast<double>(j) * hist[j];
|
||||
sqsum[0] += static_cast<double>(j * j) * hist[j];
|
||||
count[0] += hist[j];
|
||||
|
||||
sum[1] += static_cast<double>(j) * hist[j + 256];
|
||||
sqsum[1] += static_cast<double>(j * j) * hist[j + 256];
|
||||
count[1] += hist[j + 256];
|
||||
|
||||
sum[2] += static_cast<double>(j) * hist[j + 512];
|
||||
sqsum[2] += static_cast<double>(j * j) * hist[j + 512];
|
||||
count[2] += hist[j + 512];
|
||||
}
|
||||
|
||||
count[0] = std::max(count[0], 1);
|
||||
count[1] = std::max(count[1], 1);
|
||||
count[2] = std::max(count[2], 1);
|
||||
|
||||
cv::Vec3d my(
|
||||
sum[0] / count[0],
|
||||
sum[1] / count[1],
|
||||
sum[2] / count[2]
|
||||
);
|
||||
|
||||
relativeVariance[0][thres] = std::sqrt(sqsum[0] / count[0] - my[0] * my[0]);
|
||||
relativeVariance[1][thres] = std::sqrt(sqsum[1] / count[1] - my[1] * my[1]);
|
||||
relativeVariance[2][thres] = std::sqrt(sqsum[2] / count[2] - my[2] * my[2]);
|
||||
}
|
||||
}
|
||||
|
||||
void calcDiffThreshMask(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::Vec3d bestThres, cv::gpu::GpuMat& changeMask)
|
||||
{
|
||||
typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);
|
||||
static const func_t funcs[4][4] =
|
||||
{
|
||||
{0,0,0,0},
|
||||
{0,0,0,0},
|
||||
{0,0,bgfg::calcDiffThreshMask_gpu<uchar3, uchar3>,bgfg::calcDiffThreshMask_gpu<uchar3, uchar4>},
|
||||
{0,0,bgfg::calcDiffThreshMask_gpu<uchar4, uchar3>,bgfg::calcDiffThreshMask_gpu<uchar4, uchar4>}
|
||||
};
|
||||
|
||||
changeMask.setTo(cv::Scalar::all(0));
|
||||
|
||||
funcs[prevFrame.channels() - 1][curFrame.channels() - 1](prevFrame, curFrame, make_uchar3((uchar)bestThres[0], (uchar)bestThres[1], (uchar)bestThres[2]), changeMask, 0);
|
||||
}
|
||||
|
||||
// performs change detection for Foreground detection algorithm
|
||||
void changeDetection(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::gpu::GpuMat& changeMask, cv::gpu::GpuMat& hist, cv::gpu::GpuMat& histBuf)
|
||||
{
|
||||
calcDiffHistogram(prevFrame, curFrame, hist, histBuf);
|
||||
|
||||
unsigned int histData[3 * 256];
|
||||
cv::Mat h_hist(3, 256, CV_32SC1, histData);
|
||||
hist.download(h_hist);
|
||||
|
||||
double relativeVariance[3][bgfg::HISTOGRAM_BIN_COUNT];
|
||||
calcRelativeVariance(histData, relativeVariance);
|
||||
|
||||
// Find maximum:
|
||||
cv::Vec3d bestThres(10.0, 10.0, 10.0);
|
||||
for (int i = 0; i < bgfg::HISTOGRAM_BIN_COUNT; ++i)
|
||||
{
|
||||
bestThres[0] = std::max(bestThres[0], relativeVariance[0][i]);
|
||||
bestThres[1] = std::max(bestThres[1], relativeVariance[1][i]);
|
||||
bestThres[2] = std::max(bestThres[2], relativeVariance[2][i]);
|
||||
}
|
||||
|
||||
calcDiffThreshMask(prevFrame, curFrame, bestThres, changeMask);
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// bgfgClassification
|
||||
|
||||
namespace
|
||||
{
|
||||
int bgfgClassification(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame,
|
||||
const cv::gpu::GpuMat& Ftd, const cv::gpu::GpuMat& Fbd,
|
||||
cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& countBuf,
|
||||
const cv::gpu::FGDStatModel::Params& params, int out_cn)
|
||||
{
|
||||
typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground,
|
||||
int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
|
||||
static const func_t funcs[4][4][4] =
|
||||
{
|
||||
{
|
||||
{0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
|
||||
},
|
||||
{
|
||||
{0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
|
||||
},
|
||||
{
|
||||
{0,0,0,0}, {0,0,0,0},
|
||||
{0,0,bgfg::bgfgClassification_gpu<uchar3, uchar3, uchar3>,bgfg::bgfgClassification_gpu<uchar3, uchar3, uchar4>},
|
||||
{0,0,bgfg::bgfgClassification_gpu<uchar3, uchar4, uchar3>,bgfg::bgfgClassification_gpu<uchar3, uchar4, uchar4>}
|
||||
},
|
||||
{
|
||||
{0,0,0,0}, {0,0,0,0},
|
||||
{0,0,bgfg::bgfgClassification_gpu<uchar4, uchar3, uchar3>,bgfg::bgfgClassification_gpu<uchar4, uchar3, uchar4>},
|
||||
{0,0,bgfg::bgfgClassification_gpu<uchar4, uchar4, uchar3>,bgfg::bgfgClassification_gpu<uchar4, uchar4, uchar4>}
|
||||
}
|
||||
};
|
||||
|
||||
const int deltaC = cvRound(params.delta * 256 / params.Lc);
|
||||
const int deltaCC = cvRound(params.delta * 256 / params.Lcc);
|
||||
|
||||
funcs[prevFrame.channels() - 1][curFrame.channels() - 1][out_cn - 1](prevFrame, curFrame, Ftd, Fbd, foreground, deltaC, deltaCC, params.alpha2, params.N1c, params.N1cc, 0);
|
||||
|
||||
int count = cv::gpu::countNonZero(foreground, countBuf);
|
||||
|
||||
cv::gpu::multiply(foreground, cv::Scalar::all(255), foreground);
|
||||
|
||||
return count;
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// smoothForeground
|
||||
|
||||
namespace
|
||||
{
|
||||
void morphology(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::GpuMat& filterBrd, int brd, cv::Ptr<cv::gpu::FilterEngine_GPU>& filter, cv::Scalar brdVal)
|
||||
{
|
||||
cv::gpu::copyMakeBorder(src, filterBrd, brd, brd, brd, brd, cv::BORDER_CONSTANT, brdVal);
|
||||
filter->apply(filterBrd(cv::Rect(brd, brd, src.cols, src.rows)), dst, cv::Rect(0, 0, src.cols, src.rows));
|
||||
}
|
||||
|
||||
void smoothForeground(cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& filterBrd, cv::gpu::GpuMat& buf,
|
||||
cv::Ptr<cv::gpu::FilterEngine_GPU>& erodeFilter, cv::Ptr<cv::gpu::FilterEngine_GPU>& dilateFilter,
|
||||
const cv::gpu::FGDStatModel::Params& params)
|
||||
{
|
||||
const int brd = params.perform_morphing;
|
||||
|
||||
const cv::Scalar erodeBrdVal = cv::Scalar::all(UCHAR_MAX);
|
||||
const cv::Scalar dilateBrdVal = cv::Scalar::all(0);
|
||||
|
||||
// MORPH_OPEN
|
||||
morphology(foreground, buf, filterBrd, brd, erodeFilter, erodeBrdVal);
|
||||
morphology(buf, foreground, filterBrd, brd, dilateFilter, dilateBrdVal);
|
||||
|
||||
// MORPH_CLOSE
|
||||
morphology(foreground, buf, filterBrd, brd, dilateFilter, dilateBrdVal);
|
||||
morphology(buf, foreground, filterBrd, brd, erodeFilter, erodeBrdVal);
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// findForegroundRegions
|
||||
|
||||
namespace
|
||||
{
|
||||
void seqToContours(CvSeq* _ccontours, CvMemStorage* storage, cv::OutputArrayOfArrays _contours)
|
||||
{
|
||||
cv::Seq<CvSeq*> all_contours(cvTreeToNodeSeq(_ccontours, sizeof(CvSeq), storage));
|
||||
|
||||
size_t total = all_contours.size();
|
||||
|
||||
_contours.create((int) total, 1, 0, -1, true);
|
||||
|
||||
cv::SeqIterator<CvSeq*> it = all_contours.begin();
|
||||
for (size_t i = 0; i < total; ++i, ++it)
|
||||
{
|
||||
CvSeq* c = *it;
|
||||
((CvContour*)c)->color = (int)i;
|
||||
_contours.create((int)c->total, 1, CV_32SC2, (int)i, true);
|
||||
cv::Mat ci = _contours.getMat((int)i);
|
||||
CV_Assert( ci.isContinuous() );
|
||||
cvCvtSeqToArray(c, ci.data);
|
||||
}
|
||||
}
|
||||
|
||||
int findForegroundRegions(cv::gpu::GpuMat& d_foreground, cv::Mat& h_foreground, std::vector< std::vector<cv::Point> >& foreground_regions,
|
||||
CvMemStorage* storage, const cv::gpu::FGDStatModel::Params& params)
|
||||
{
|
||||
int region_count = 0;
|
||||
|
||||
// Discard under-size foreground regions:
|
||||
|
||||
d_foreground.download(h_foreground);
|
||||
IplImage ipl_foreground = h_foreground;
|
||||
CvSeq* first_seq = 0;
|
||||
|
||||
cvFindContours(&ipl_foreground, storage, &first_seq, sizeof(CvContour), CV_RETR_LIST);
|
||||
|
||||
for (CvSeq* seq = first_seq; seq; seq = seq->h_next)
|
||||
{
|
||||
CvContour* cnt = reinterpret_cast<CvContour*>(seq);
|
||||
|
||||
if (cnt->rect.width * cnt->rect.height < params.minArea || (params.is_obj_without_holes && CV_IS_SEQ_HOLE(seq)))
|
||||
{
|
||||
// Delete under-size contour:
|
||||
CvSeq* prev_seq = seq->h_prev;
|
||||
if (prev_seq)
|
||||
{
|
||||
prev_seq->h_next = seq->h_next;
|
||||
|
||||
if (seq->h_next)
|
||||
seq->h_next->h_prev = prev_seq;
|
||||
}
|
||||
else
|
||||
{
|
||||
first_seq = seq->h_next;
|
||||
|
||||
if (seq->h_next)
|
||||
seq->h_next->h_prev = NULL;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
region_count++;
|
||||
}
|
||||
}
|
||||
|
||||
seqToContours(first_seq, storage, foreground_regions);
|
||||
h_foreground.setTo(0);
|
||||
|
||||
cv::drawContours(h_foreground, foreground_regions, -1, cv::Scalar::all(255), -1);
|
||||
|
||||
d_foreground.upload(h_foreground);
|
||||
|
||||
return region_count;
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// updateBackgroundModel
|
||||
|
||||
namespace
|
||||
{
|
||||
void updateBackgroundModel(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, const cv::gpu::GpuMat& Ftd, const cv::gpu::GpuMat& Fbd,
|
||||
const cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& background,
|
||||
const cv::gpu::FGDStatModel::Params& params)
|
||||
{
|
||||
typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd,
|
||||
cv::gpu::PtrStepSzb foreground, cv::gpu::PtrStepSzb background,
|
||||
int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
|
||||
static const func_t funcs[4][4][4] =
|
||||
{
|
||||
{
|
||||
{0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
|
||||
},
|
||||
{
|
||||
{0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
|
||||
},
|
||||
{
|
||||
{0,0,0,0}, {0,0,0,0},
|
||||
{0,0,bgfg::updateBackgroundModel_gpu<uchar3, uchar3, uchar3>,bgfg::updateBackgroundModel_gpu<uchar3, uchar3, uchar4>},
|
||||
{0,0,bgfg::updateBackgroundModel_gpu<uchar3, uchar4, uchar3>,bgfg::updateBackgroundModel_gpu<uchar3, uchar4, uchar4>}
|
||||
},
|
||||
{
|
||||
{0,0,0,0}, {0,0,0,0},
|
||||
{0,0,bgfg::updateBackgroundModel_gpu<uchar4, uchar3, uchar3>,bgfg::updateBackgroundModel_gpu<uchar4, uchar3, uchar4>},
|
||||
{0,0,bgfg::updateBackgroundModel_gpu<uchar4, uchar4, uchar3>,bgfg::updateBackgroundModel_gpu<uchar4, uchar4, uchar4>}
|
||||
}
|
||||
};
|
||||
|
||||
const int deltaC = cvRound(params.delta * 256 / params.Lc);
|
||||
const int deltaCC = cvRound(params.delta * 256 / params.Lcc);
|
||||
|
||||
funcs[prevFrame.channels() - 1][curFrame.channels() - 1][background.channels() - 1](
|
||||
prevFrame, curFrame, Ftd, Fbd, foreground, background,
|
||||
deltaC, deltaCC, params.alpha1, params.alpha2, params.alpha3, params.N1c, params.N1cc, params.N2c, params.N2cc, params.T,
|
||||
0);
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Impl::update
|
||||
|
||||
int cv::gpu::FGDStatModel::Impl::update(const cv::gpu::GpuMat& curFrame)
{
    CV_Assert(curFrame.type() == CV_8UC3 || curFrame.type() == CV_8UC4);
    CV_Assert(curFrame.size() == prevFrame_.size());

    cvClearMemStorage(storage_);
    foreground_regions_.clear();
    foreground_.setTo(cv::Scalar::all(0));

    changeDetection(prevFrame_, curFrame, Ftd_, hist_, histBuf_);
    changeDetection(background_, curFrame, Fbd_, hist_, histBuf_);

    int FG_pixels_count = bgfgClassification(prevFrame_, curFrame, Ftd_, Fbd_, foreground_, countBuf_, params_, out_cn_);

    if (params_.perform_morphing > 0)
        smoothForeground(foreground_, filterBrd_, buf_, erodeFilter_, dilateFilter_, params_);

    int region_count = 0;
    if (params_.minArea > 0 || params_.is_obj_without_holes)
        region_count = findForegroundRegions(foreground_, h_foreground_, foreground_regions_, storage_, params_);

    // Check ALL BG update condition:
    const double BGFG_FGD_BG_UPDATE_TRESH = 0.5;
    if (static_cast<double>(FG_pixels_count) / Ftd_.size().area() > BGFG_FGD_BG_UPDATE_TRESH)
        stat_.setTrained();

    updateBackgroundModel(prevFrame_, curFrame, Ftd_, Fbd_, foreground_, background_, params_);

    copyChannels(curFrame, prevFrame_);

    return region_count;
}
|
||||
|
||||
namespace
{
    // Default parameters of the foreground detection algorithm:
    const int BGFG_FGD_LC  = 128;
    const int BGFG_FGD_N1C = 15;
    const int BGFG_FGD_N2C = 25;

    const int BGFG_FGD_LCC  = 64;
    const int BGFG_FGD_N1CC = 25;
    const int BGFG_FGD_N2CC = 40;

    // Background reference image update parameter:
    const float BGFG_FGD_ALPHA_1 = 0.1f;

    // Stat model update parameter:
    // 0.002f ~ 1000 frames (~45 sec), 0.005f ~ 18 sec (assuming 25 fps and an absolutely static background)
    const float BGFG_FGD_ALPHA_2 = 0.005f;

    // Start value for the alpha parameter (to initialize the statistical model quickly):
    const float BGFG_FGD_ALPHA_3 = 0.1f;

    const float BGFG_FGD_DELTA = 2.0f;

    const float BGFG_FGD_T = 0.9f;

    const float BGFG_FGD_MINAREA = 15.0f;
}

cv::gpu::FGDStatModel::Params::Params()
{
    Lc  = BGFG_FGD_LC;
    N1c = BGFG_FGD_N1C;
    N2c = BGFG_FGD_N2C;

    Lcc  = BGFG_FGD_LCC;
    N1cc = BGFG_FGD_N1CC;
    N2cc = BGFG_FGD_N2CC;

    delta = BGFG_FGD_DELTA;

    alpha1 = BGFG_FGD_ALPHA_1;
    alpha2 = BGFG_FGD_ALPHA_2;
    alpha3 = BGFG_FGD_ALPHA_3;

    T = BGFG_FGD_T;
    minArea = BGFG_FGD_MINAREA;

    is_obj_without_holes = true;
    perform_morphing = 1;
}
|
||||
|
||||
cv::gpu::FGDStatModel::FGDStatModel(int out_cn)
|
||||
{
|
||||
impl_.reset(new Impl(background, foreground, foreground_regions, out_cn));
|
||||
}
|
||||
|
||||
cv::gpu::FGDStatModel::FGDStatModel(const cv::gpu::GpuMat& firstFrame, const Params& params, int out_cn)
|
||||
{
|
||||
impl_.reset(new Impl(background, foreground, foreground_regions, out_cn));
|
||||
create(firstFrame, params);
|
||||
}
|
||||
|
||||
cv::gpu::FGDStatModel::~FGDStatModel()
|
||||
{
|
||||
}
|
||||
|
||||
void cv::gpu::FGDStatModel::create(const cv::gpu::GpuMat& firstFrame, const Params& params)
|
||||
{
|
||||
impl_->create(firstFrame, params);
|
||||
}
|
||||
|
||||
void cv::gpu::FGDStatModel::release()
|
||||
{
|
||||
impl_->release();
|
||||
}
|
||||
|
||||
int cv::gpu::FGDStatModel::update(const cv::gpu::GpuMat& curFrame)
|
||||
{
|
||||
return impl_->update(curFrame);
|
||||
}
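// Usage sketch (hypothetical, not part of the original file). The public members background,
// foreground and foreground_regions are the ones handed to Impl in the constructors above, so a
// typical loop looks like this (frame source and default arguments are assumptions):
//
//     cv::gpu::GpuMat d_frame(firstFrame);      // CV_8UC3 or CV_8UC4 input
//     cv::gpu::FGDStatModel model(d_frame);     // default Params
//     while (capture.read(frame))
//     {
//         d_frame.upload(frame);
//         int regions = model.update(d_frame);  // number of foreground regions found
//         // model.foreground, model.background and model.foreground_regions now hold the results
//     }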
|
||||
|
||||
#endif // HAVE_CUDA
|
||||
@@ -1,242 +0,0 @@
|
||||
/* Standard OpenCV BSD license header omitted here; it is identical in every file of this commit. */
|
||||
|
||||
#include "precomp.hpp"
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::gpu;
|
||||
|
||||
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
|
||||
|
||||
void cv::gpu::calcOpticalFlowBM(const GpuMat&, const GpuMat&, Size, Size, Size, bool, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
|
||||
|
||||
void cv::gpu::FastOpticalFlowBM::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, int, int, Stream&) { throw_no_cuda(); }
|
||||
|
||||
#else // HAVE_CUDA
|
||||
|
||||
namespace optflowbm
|
||||
{
|
||||
void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
|
||||
int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream);
|
||||
}
|
||||
|
||||
void cv::gpu::calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr, Size blockSize, Size shiftSize, Size maxRange, bool usePrevious, GpuMat& velx, GpuMat& vely, GpuMat& buf, Stream& st)
|
||||
{
|
||||
CV_Assert( prev.type() == CV_8UC1 );
|
||||
CV_Assert( curr.size() == prev.size() && curr.type() == prev.type() );
|
||||
|
||||
const Size velSize((prev.cols - blockSize.width + shiftSize.width) / shiftSize.width,
|
||||
(prev.rows - blockSize.height + shiftSize.height) / shiftSize.height);
|
||||
|
||||
velx.create(velSize, CV_32FC1);
|
||||
vely.create(velSize, CV_32FC1);
|
||||
|
||||
    // scanning scheme coordinates
    std::vector<short2> ss((2 * maxRange.width + 1) * (2 * maxRange.height + 1));
    int ssCount = 0;

    // Calculate scanning scheme
    const int minCount = std::min(maxRange.width, maxRange.height);

    // use spiral search pattern
    //
    //     9 10 11 12
    //     8  1  2 13
    //     7  *  3 14
    //     6  5  4 15
    // ... 20 19 18 17
    //
|
||||
|
||||
for (int i = 0; i < minCount; ++i)
|
||||
{
|
||||
// four cycles along sides
|
||||
int x = -i - 1, y = x;
|
||||
|
||||
// upper side
|
||||
for (int j = -i; j <= i + 1; ++j, ++ssCount)
|
||||
{
|
||||
ss[ssCount].x = ++x;
|
||||
ss[ssCount].y = y;
|
||||
}
|
||||
|
||||
// right side
|
||||
for (int j = -i; j <= i + 1; ++j, ++ssCount)
|
||||
{
|
||||
ss[ssCount].x = x;
|
||||
ss[ssCount].y = ++y;
|
||||
}
|
||||
|
||||
// bottom side
|
||||
for (int j = -i; j <= i + 1; ++j, ++ssCount)
|
||||
{
|
||||
ss[ssCount].x = --x;
|
||||
ss[ssCount].y = y;
|
||||
}
|
||||
|
||||
// left side
|
||||
for (int j = -i; j <= i + 1; ++j, ++ssCount)
|
||||
{
|
||||
ss[ssCount].x = x;
|
||||
ss[ssCount].y = --y;
|
||||
}
|
||||
}
|
||||
|
||||
// the rest part
|
||||
if (maxRange.width < maxRange.height)
|
||||
{
|
||||
const int xleft = -minCount;
|
||||
|
||||
// cycle by neighbor rings
|
||||
for (int i = minCount; i < maxRange.height; ++i)
|
||||
{
|
||||
// two cycles by x
|
||||
int y = -(i + 1);
|
||||
int x = xleft;
|
||||
|
||||
// upper side
|
||||
for (int j = -maxRange.width; j <= maxRange.width; ++j, ++ssCount, ++x)
|
||||
{
|
||||
ss[ssCount].x = x;
|
||||
ss[ssCount].y = y;
|
||||
}
|
||||
|
||||
x = xleft;
|
||||
y = -y;
|
||||
|
||||
// bottom side
|
||||
for (int j = -maxRange.width; j <= maxRange.width; ++j, ++ssCount, ++x)
|
||||
{
|
||||
ss[ssCount].x = x;
|
||||
ss[ssCount].y = y;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (maxRange.width > maxRange.height)
|
||||
{
|
||||
const int yupper = -minCount;
|
||||
|
||||
// cycle by neighbor rings
|
||||
for (int i = minCount; i < maxRange.width; ++i)
|
||||
{
|
||||
// two cycles by y
|
||||
int x = -(i + 1);
|
||||
int y = yupper;
|
||||
|
||||
// left side
|
||||
for (int j = -maxRange.height; j <= maxRange.height; ++j, ++ssCount, ++y)
|
||||
{
|
||||
ss[ssCount].x = x;
|
||||
ss[ssCount].y = y;
|
||||
}
|
||||
|
||||
y = yupper;
|
||||
x = -x;
|
||||
|
||||
// right side
|
||||
for (int j = -maxRange.height; j <= maxRange.height; ++j, ++ssCount, ++y)
|
||||
{
|
||||
ss[ssCount].x = x;
|
||||
ss[ssCount].y = y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const cudaStream_t stream = StreamAccessor::getStream(st);
|
||||
|
||||
ensureSizeIsEnough(1, ssCount, CV_16SC2, buf);
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaMemcpy(buf.data, &ss[0], ssCount * sizeof(short2), cudaMemcpyHostToDevice) );
|
||||
else
|
||||
cudaSafeCall( cudaMemcpyAsync(buf.data, &ss[0], ssCount * sizeof(short2), cudaMemcpyHostToDevice, stream) );
|
||||
|
||||
const int maxX = prev.cols - blockSize.width;
|
||||
const int maxY = prev.rows - blockSize.height;
|
||||
|
||||
const int SMALL_DIFF = 2;
|
||||
const int BIG_DIFF = 128;
|
||||
|
||||
const int blSize = blockSize.area();
|
||||
const int acceptLevel = blSize * SMALL_DIFF;
|
||||
const int escapeLevel = blSize * BIG_DIFF;
|
||||
|
||||
optflowbm::calc(prev, curr, velx, vely,
|
||||
make_int2(blockSize.width, blockSize.height), make_int2(shiftSize.width, shiftSize.height), usePrevious,
|
||||
maxX, maxY, acceptLevel, escapeLevel, buf.ptr<short2>(), ssCount, stream);
|
||||
}
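// Usage sketch (hypothetical, illustrative parameter values only):
//
//     cv::gpu::GpuMat d_prev, d_curr, d_velx, d_vely, d_buf;   // d_prev / d_curr are CV_8UC1
//     cv::gpu::calcOpticalFlowBM(d_prev, d_curr,
//                                cv::Size(16, 16),   // blockSize
//                                cv::Size(1, 1),     // shiftSize
//                                cv::Size(16, 16),   // maxRange
//                                false,              // usePrevious
//                                d_velx, d_vely, d_buf, cv::gpu::Stream::Null());
//
// The velocity fields come back as CV_32FC1 with the velSize computed at the top of this function,
// i.e. one vector per block position rather than one per pixel.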
|
||||
|
||||
namespace optflowbm_fast
|
||||
{
|
||||
void get_buffer_size(int src_cols, int src_rows, int search_window, int block_window, int& buffer_cols, int& buffer_rows);
|
||||
|
||||
template <typename T>
|
||||
void calc(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream);
|
||||
}
|
||||
|
||||
void cv::gpu::FastOpticalFlowBM::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, int search_window, int block_window, Stream& stream)
|
||||
{
|
||||
CV_Assert( I0.type() == CV_8UC1 );
|
||||
CV_Assert( I1.size() == I0.size() && I1.type() == I0.type() );
|
||||
|
||||
int border_size = search_window / 2 + block_window / 2;
|
||||
Size esize = I0.size() + Size(border_size, border_size) * 2;
|
||||
|
||||
ensureSizeIsEnough(esize, I0.type(), extended_I0);
|
||||
ensureSizeIsEnough(esize, I0.type(), extended_I1);
|
||||
|
||||
copyMakeBorder(I0, extended_I0, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), stream);
|
||||
copyMakeBorder(I1, extended_I1, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), stream);
|
||||
|
||||
GpuMat I0_hdr = extended_I0(Rect(Point2i(border_size, border_size), I0.size()));
|
||||
GpuMat I1_hdr = extended_I1(Rect(Point2i(border_size, border_size), I0.size()));
|
||||
|
||||
int bcols, brows;
|
||||
optflowbm_fast::get_buffer_size(I0.cols, I0.rows, search_window, block_window, bcols, brows);
|
||||
|
||||
ensureSizeIsEnough(brows, bcols, CV_32SC1, buffer);
|
||||
|
||||
flowx.create(I0.size(), CV_32FC1);
|
||||
flowy.create(I0.size(), CV_32FC1);
|
||||
|
||||
optflowbm_fast::calc<uchar>(I0_hdr, I1_hdr, flowx, flowy, buffer, search_window, block_window, StreamAccessor::getStream(stream));
|
||||
}
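// Usage sketch (hypothetical): extended_I0, extended_I1 and buffer are class members (they are used
// above without local declarations), so a single FastOpticalFlowBM instance can be reused across
// frames without reallocating its work buffers:
//
//     cv::gpu::FastOpticalFlowBM fbm;
//     cv::gpu::GpuMat d_flowx, d_flowy;
//     fbm(d_prev, d_curr, d_flowx, d_flowy, 21, 7, cv::gpu::Stream::Null());   // window sizes are illustrative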
|
||||
|
||||
#endif // HAVE_CUDA
|
||||
@@ -1,237 +0,0 @@
|
||||
/* Standard OpenCV BSD license header omitted here; it is identical in every file of this commit. */
|
||||
|
||||
#include "precomp.hpp"
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::gpu;
|
||||
|
||||
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
|
||||
|
||||
void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
|
||||
void cv::gpu::interpolateFrames(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, float, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
|
||||
void cv::gpu::createOpticalFlowNeedleMap(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
|
||||
|
||||
#else
|
||||
|
||||
namespace
|
||||
{
|
||||
size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc, const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
|
||||
NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
|
||||
{
|
||||
NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
|
||||
|
||||
ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) );
|
||||
|
||||
return gpuCounter.maxSize();
|
||||
}
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
static void outputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
|
||||
}
|
||||
|
||||
void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s)
|
||||
{
|
||||
ncvSetDebugOutputHandler(outputHandler);
|
||||
|
||||
CV_Assert(frame0.type() == CV_32FC1);
|
||||
CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
|
||||
|
||||
u.create(frame0.size(), CV_32FC1);
|
||||
v.create(frame0.size(), CV_32FC1);
|
||||
|
||||
cudaDeviceProp devProp;
|
||||
cudaSafeCall( cudaGetDeviceProperties(&devProp, getDevice()) );
|
||||
|
||||
NCVBroxOpticalFlowDescriptor desc;
|
||||
|
||||
desc.alpha = alpha;
|
||||
desc.gamma = gamma;
|
||||
desc.scale_factor = scale_factor;
|
||||
desc.number_of_inner_iterations = inner_iterations;
|
||||
desc.number_of_outer_iterations = outer_iterations;
|
||||
desc.number_of_solver_iterations = solver_iterations;
|
||||
|
||||
NCVMemSegment frame0MemSeg;
|
||||
frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
|
||||
frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
|
||||
frame0MemSeg.size = frame0.step * frame0.rows;
|
||||
|
||||
NCVMemSegment frame1MemSeg;
|
||||
frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
|
||||
frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
|
||||
frame1MemSeg.size = frame1.step * frame1.rows;
|
||||
|
||||
NCVMemSegment uMemSeg;
|
||||
uMemSeg.begin.memtype = NCVMemoryTypeDevice;
|
||||
uMemSeg.begin.ptr = u.ptr();
|
||||
uMemSeg.size = u.step * u.rows;
|
||||
|
||||
NCVMemSegment vMemSeg;
|
||||
vMemSeg.begin.memtype = NCVMemoryTypeDevice;
|
||||
vMemSeg.begin.ptr = v.ptr();
|
||||
vMemSeg.size = v.step * v.rows;
|
||||
|
||||
NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
|
||||
NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
|
||||
NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
|
||||
NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));
|
||||
|
||||
cudaStream_t stream = StreamAccessor::getStream(s);
|
||||
|
||||
size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, devProp);
|
||||
|
||||
ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, buf);
|
||||
|
||||
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
|
||||
|
||||
ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) );
|
||||
}
|
||||
|
||||
void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, const GpuMat& fu, const GpuMat& fv, const GpuMat& bu, const GpuMat& bv,
|
||||
float pos, GpuMat& newFrame, GpuMat& buf, Stream& s)
|
||||
{
|
||||
CV_Assert(frame0.type() == CV_32FC1);
|
||||
CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
|
||||
CV_Assert(fu.size() == frame0.size() && fu.type() == frame0.type());
|
||||
CV_Assert(fv.size() == frame0.size() && fv.type() == frame0.type());
|
||||
CV_Assert(bu.size() == frame0.size() && bu.type() == frame0.type());
|
||||
CV_Assert(bv.size() == frame0.size() && bv.type() == frame0.type());
|
||||
|
||||
newFrame.create(frame0.size(), frame0.type());
|
||||
|
||||
buf.create(6 * frame0.rows, frame0.cols, CV_32FC1);
|
||||
buf.setTo(Scalar::all(0));
|
||||
|
||||
// occlusion masks
|
||||
GpuMat occ0 = buf.rowRange(0 * frame0.rows, 1 * frame0.rows);
|
||||
GpuMat occ1 = buf.rowRange(1 * frame0.rows, 2 * frame0.rows);
|
||||
|
||||
// interpolated forward flow
|
||||
GpuMat fui = buf.rowRange(2 * frame0.rows, 3 * frame0.rows);
|
||||
GpuMat fvi = buf.rowRange(3 * frame0.rows, 4 * frame0.rows);
|
||||
|
||||
// interpolated backward flow
|
||||
GpuMat bui = buf.rowRange(4 * frame0.rows, 5 * frame0.rows);
|
||||
GpuMat bvi = buf.rowRange(5 * frame0.rows, 6 * frame0.rows);
|
||||
|
||||
size_t step = frame0.step;
|
||||
|
||||
CV_Assert(frame1.step == step && fu.step == step && fv.step == step && bu.step == step && bv.step == step && newFrame.step == step && buf.step == step);
|
||||
|
||||
cudaStream_t stream = StreamAccessor::getStream(s);
|
||||
NppStStreamHandler h(stream);
|
||||
|
||||
NppStInterpolationState state;
|
||||
|
||||
state.size = NcvSize32u(frame0.cols, frame0.rows);
|
||||
state.nStep = static_cast<Ncv32u>(step);
|
||||
state.pSrcFrame0 = const_cast<Ncv32f*>(frame0.ptr<Ncv32f>());
|
||||
state.pSrcFrame1 = const_cast<Ncv32f*>(frame1.ptr<Ncv32f>());
|
||||
state.pFU = const_cast<Ncv32f*>(fu.ptr<Ncv32f>());
|
||||
state.pFV = const_cast<Ncv32f*>(fv.ptr<Ncv32f>());
|
||||
state.pBU = const_cast<Ncv32f*>(bu.ptr<Ncv32f>());
|
||||
state.pBV = const_cast<Ncv32f*>(bv.ptr<Ncv32f>());
|
||||
state.pos = pos;
|
||||
state.pNewFrame = newFrame.ptr<Ncv32f>();
|
||||
state.ppBuffers[0] = occ0.ptr<Ncv32f>();
|
||||
state.ppBuffers[1] = occ1.ptr<Ncv32f>();
|
||||
state.ppBuffers[2] = fui.ptr<Ncv32f>();
|
||||
state.ppBuffers[3] = fvi.ptr<Ncv32f>();
|
||||
state.ppBuffers[4] = bui.ptr<Ncv32f>();
|
||||
state.ppBuffers[5] = bvi.ptr<Ncv32f>();
|
||||
|
||||
ncvSafeCall( nppiStInterpolateFrames(&state) );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
namespace cv { namespace gpu { namespace cudev
|
||||
{
|
||||
namespace optical_flow
|
||||
{
|
||||
void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg);
|
||||
void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMat& vertex, GpuMat& colors)
|
||||
{
|
||||
using namespace cv::gpu::cudev::optical_flow;
|
||||
|
||||
CV_Assert(u.type() == CV_32FC1);
|
||||
CV_Assert(v.type() == u.type() && v.size() == u.size());
|
||||
|
||||
const int NEEDLE_MAP_SCALE = 16;
|
||||
|
||||
const int x_needles = u.cols / NEEDLE_MAP_SCALE;
|
||||
const int y_needles = u.rows / NEEDLE_MAP_SCALE;
|
||||
|
||||
GpuMat u_avg(y_needles, x_needles, CV_32FC1);
|
||||
GpuMat v_avg(y_needles, x_needles, CV_32FC1);
|
||||
|
||||
NeedleMapAverage_gpu(u, v, u_avg, v_avg);
|
||||
|
||||
const int NUM_VERTS_PER_ARROW = 6;
|
||||
|
||||
const int num_arrows = x_needles * y_needles * NUM_VERTS_PER_ARROW;
|
||||
|
||||
vertex.create(1, num_arrows, CV_32FC3);
|
||||
colors.create(1, num_arrows, CV_32FC3);
|
||||
|
||||
colors.setTo(Scalar::all(1.0));
|
||||
|
||||
double uMax, vMax;
|
||||
minMax(u_avg, 0, &uMax);
|
||||
minMax(v_avg, 0, &vMax);
|
||||
|
||||
float max_flow = static_cast<float>(std::sqrt(uMax * uMax + vMax * vMax));
|
||||
|
||||
CreateOpticalFlowNeedleMap_gpu(u_avg, v_avg, vertex.ptr<float>(), colors.ptr<float>(), max_flow, 1.0f / u.cols, 1.0f / u.rows);
|
||||
|
||||
cvtColor(colors, colors, COLOR_HSV2RGB);
|
||||
}
|
||||
|
||||
#endif /* HAVE_CUDA */
|
||||
@@ -1,409 +0,0 @@
|
||||
/* Standard OpenCV BSD license header omitted here; it is identical in every file of this commit. */
|
||||
|
||||
#include "precomp.hpp"
|
||||
|
||||
#define MIN_SIZE 32
|
||||
|
||||
#define S(x) StreamAccessor::getStream(x)
|
||||
|
||||
// GPU resize() is fast, but it differs from the CPU analog. Disabling this flag
// leads to inefficient code; it is intended for debugging purposes only.
#define ENABLE_GPU_RESIZE 1
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::gpu;
|
||||
|
||||
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)

void cv::gpu::FarnebackOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }

#else

namespace cv { namespace gpu { namespace cudev { namespace optflow_farneback
{
    void setPolynomialExpansionConsts(
            int polyN, const float *g, const float *xg, const float *xxg,
            float ig11, float ig03, float ig33, float ig55);

    void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream);

    void setUpdateMatricesConsts();

    void updateMatricesGpu(
            const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
            PtrStepSzf M, cudaStream_t stream);

    void updateFlowGpu(
            const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream);

    /*void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);*/

    void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);

    void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);

    void setGaussianBlurKernel(const float *gKer, int ksizeHalf);

    void gaussianBlurGpu(
            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);

    void gaussianBlur5Gpu(
            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);

    void gaussianBlur5Gpu_CC11(
            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);
}}}} // namespace cv { namespace gpu { namespace cudev { namespace optflow_farneback


void cv::gpu::FarnebackOpticalFlow::prepareGaussian(
        int n, double sigma, float *g, float *xg, float *xxg,
        double &ig11, double &ig03, double &ig33, double &ig55)
{
    double s = 0.;
    for (int x = -n; x <= n; x++)
    {
        g[x] = (float)std::exp(-x*x/(2*sigma*sigma));
        s += g[x];
    }

    s = 1./s;
    for (int x = -n; x <= n; x++)
    {
        g[x] = (float)(g[x]*s);
        xg[x] = (float)(x*g[x]);
        xxg[x] = (float)(x*x*g[x]);
    }

    Mat_<double> G(6, 6);
    G.setTo(0);

    for (int y = -n; y <= n; y++)
    {
        for (int x = -n; x <= n; x++)
        {
            G(0,0) += g[y]*g[x];
            G(1,1) += g[y]*g[x]*x*x;
            G(3,3) += g[y]*g[x]*x*x*x*x;
            G(5,5) += g[y]*g[x]*x*x*y*y;
        }
    }

    //G[0][0] = 1.;
    G(2,2) = G(0,3) = G(0,4) = G(3,0) = G(4,0) = G(1,1);
    G(4,4) = G(3,3);
    G(3,4) = G(4,3) = G(5,5);

    // invG:
    // [ x        e  e    ]
    // [    y             ]
    // [       y          ]
    // [ e        z       ]
    // [ e           z    ]
    // [                u ]
    Mat_<double> invG = G.inv(DECOMP_CHOLESKY);

    ig11 = invG(1,1);
    ig03 = invG(0,3);
    ig33 = invG(3,3);
    ig55 = invG(5,5);
}


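// Note on prepareGaussian() above: it builds a normalized 1-D Gaussian g together with its
// first- and second-moment kernels xg(x) = x*g(x) and xxg(x) = x*x*g(x), accumulates the
// 6x6 moment matrix G of the separable 2-D weight g(x)*g(y) (sums of 1, x^2, x^4 and x^2*y^2,
// with the remaining entries filled in by symmetry), and returns the entries (1,1), (0,3),
// (3,3) and (5,5) of its Cholesky-based inverse. Those values are the normalization constants
// handed to the GPU polynomial-expansion kernel by setPolynomialExpansionConsts() below.
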
void cv::gpu::FarnebackOpticalFlow::setPolynomialExpansionConsts(int n, double sigma)
{
    std::vector<float> buf(n*6 + 3);
    float* g = &buf[0] + n;
    float* xg = g + n*2 + 1;
    float* xxg = xg + n*2 + 1;

    if (sigma < FLT_EPSILON)
        sigma = n*0.3;

    double ig11, ig03, ig33, ig55;
    prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);

    cudev::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, static_cast<float>(ig11), static_cast<float>(ig03), static_cast<float>(ig33), static_cast<float>(ig55));
}


void cv::gpu::FarnebackOpticalFlow::updateFlow_boxFilter(
        const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
        GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
{
    if (deviceSupports(FEATURE_SET_COMPUTE_12))
        cudev::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
    else
        cudev::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, S(streams[0]));
    swap(M, bufM);

    for (int i = 1; i < 5; ++i)
        streams[i].waitForCompletion();
    cudev::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));

    if (updateMatrices)
        cudev::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
}


void cv::gpu::FarnebackOpticalFlow::updateFlow_gaussianBlur(
        const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
        GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
{
    if (deviceSupports(FEATURE_SET_COMPUTE_12))
        cudev::optflow_farneback::gaussianBlur5Gpu(
                M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
    else
        cudev::optflow_farneback::gaussianBlur5Gpu_CC11(
                M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
    swap(M, bufM);

    cudev::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));

    if (updateMatrices)
        cudev::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
}


void cv::gpu::FarnebackOpticalFlow::operator ()(
        const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s)
{
    CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
    CV_Assert(frame0.size() == frame1.size());
    CV_Assert(polyN == 5 || polyN == 7);
    CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6);

    Stream streams[5];
    if (S(s))
        streams[0] = s;

    Size size = frame0.size();
    GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY;

    flowx.create(size, CV_32F);
    flowy.create(size, CV_32F);
    GpuMat flowx0 = flowx;
    GpuMat flowy0 = flowy;

    // Crop unnecessary levels
    double scale = 1;
    int numLevelsCropped = 0;
    for (; numLevelsCropped < numLevels; numLevelsCropped++)
    {
        scale *= pyrScale;
        if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
            break;
    }

    streams[0].enqueueConvert(frame0, frames_[0], CV_32F);
    streams[1].enqueueConvert(frame1, frames_[1], CV_32F);

    if (fastPyramids)
    {
        // Build Gaussian pyramids using pyrDown()
        pyramid0_.resize(numLevelsCropped + 1);
        pyramid1_.resize(numLevelsCropped + 1);
        pyramid0_[0] = frames_[0];
        pyramid1_[0] = frames_[1];
        for (int i = 1; i <= numLevelsCropped; ++i)
        {
            pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]);
            pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]);
        }
    }

    setPolynomialExpansionConsts(polyN, polySigma);
    cudev::optflow_farneback::setUpdateMatricesConsts();

    for (int k = numLevelsCropped; k >= 0; k--)
    {
        streams[0].waitForCompletion();

        scale = 1;
        for (int i = 0; i < k; i++)
            scale *= pyrScale;

        double sigma = (1./scale - 1) * 0.5;
        int smoothSize = cvRound(sigma*5) | 1;
        smoothSize = std::max(smoothSize, 3);

        int width = cvRound(size.width*scale);
        int height = cvRound(size.height*scale);

        if (fastPyramids)
        {
            width = pyramid0_[k].cols;
            height = pyramid0_[k].rows;
        }

        if (k > 0)
        {
            curFlowX.create(height, width, CV_32F);
            curFlowY.create(height, width, CV_32F);
        }
        else
        {
            curFlowX = flowx0;
            curFlowY = flowy0;
        }

        if (!prevFlowX.data)
        {
            if (flags & OPTFLOW_USE_INITIAL_FLOW)
            {
#if ENABLE_GPU_RESIZE
                resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
                resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
                streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), scale);
                streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), scale);
#else
                Mat tmp1, tmp2;
                flowx0.download(tmp1);
                resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_AREA);
                tmp2 *= scale;
                curFlowX.upload(tmp2);
                flowy0.download(tmp1);
                resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_AREA);
                tmp2 *= scale;
                curFlowY.upload(tmp2);
#endif
            }
            else
            {
                streams[0].enqueueMemSet(curFlowX, 0);
                streams[1].enqueueMemSet(curFlowY, 0);
            }
        }
        else
        {
#if ENABLE_GPU_RESIZE
            resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
            resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
            streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), 1./pyrScale);
            streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), 1./pyrScale);
#else
            Mat tmp1, tmp2;
            prevFlowX.download(tmp1);
            resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR);
            tmp2 *= 1./pyrScale;
            curFlowX.upload(tmp2);
            prevFlowY.download(tmp1);
            resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR);
            tmp2 *= 1./pyrScale;
            curFlowY.upload(tmp2);
#endif
        }

        GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
        GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
        GpuMat R[2] =
        {
            allocMatFromBuf(5*height, width, CV_32F, R_[0]),
            allocMatFromBuf(5*height, width, CV_32F, R_[1])
        };

        if (fastPyramids)
        {
            cudev::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN, R[0], S(streams[0]));
            cudev::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN, R[1], S(streams[1]));
        }
        else
        {
            GpuMat blurredFrame[2] =
            {
                allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
                allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
            };
            GpuMat pyrLevel[2] =
            {
                allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
                allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
            };

            Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
            cudev::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);

            for (int i = 0; i < 2; i++)
            {
                cudev::optflow_farneback::gaussianBlurGpu(
                    frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101_GPU, S(streams[i]));
#if ENABLE_GPU_RESIZE
                resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0, 0, INTER_LINEAR, streams[i]);
#else
                Mat tmp1, tmp2;
                blurredFrame[i].download(tmp1);
                resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR);
                pyrLevel[i].upload(tmp2);
#endif
                cudev::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN, R[i], S(streams[i]));
            }
        }

        streams[1].waitForCompletion();
        cudev::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, S(streams[0]));

        if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
        {
            Mat g = getGaussianKernel(winSize, winSize/2*0.3f, CV_32F);
            cudev::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize/2), winSize/2);
        }
        for (int i = 0; i < numIters; i++)
        {
            if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
                updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
            else
                updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
        }

        prevFlowX = curFlowX;
        prevFlowY = curFlowY;
    }

    flowx = curFlowX;
    flowy = curFlowY;

    if (!S(s))
        streams[0].waitForCompletion();
}

#endif
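
For reference, a minimal usage sketch of the class implemented above. This is not part of the original file; it assumes the pre-3.0 cv::gpu API shown here and two hypothetical, already-loaded 8-bit single-channel host images frame0 and frame1; the parameter values are illustrative only.

    cv::gpu::GpuMat d_frame0(frame0), d_frame1(frame1), d_flowx, d_flowy;

    cv::gpu::FarnebackOpticalFlow flow;
    flow.numLevels = 5;        // pyramid depth; operator() crops levels smaller than MIN_SIZE
    flow.pyrScale  = 0.5;      // must stay at 0.5 when fastPyramids is enabled (see the assert)
    flow.winSize   = 13;       // averaging window passed to updateFlow_*()
    flow.numIters  = 10;
    flow.polyN     = 5;        // operator() asserts polyN == 5 || polyN == 7
    flow.polySigma = 1.1;

    flow(d_frame0, d_frame1, d_flowx, d_flowy);   // dense CV_32F flow, x and y components

    cv::Mat flowx, flowy;
    d_flowx.download(flowx);
    d_flowy.download(flowy);
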
@@ -1,250 +0,0 @@
/* (standard OpenCV BSD license header, identical to the copy reproduced above) */

#include "precomp.hpp"
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::gpu;
|
||||
|
||||
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
|
||||
|
||||
cv::gpu::PyrLKOpticalFlow::PyrLKOpticalFlow() { throw_no_cuda(); }
|
||||
void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_no_cuda(); }
|
||||
void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_no_cuda(); }
|
||||
void cv::gpu::PyrLKOpticalFlow::releaseMemory() {}
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
namespace pyrlk
|
||||
{
|
||||
void loadConstants(int2 winSize, int iters);
|
||||
|
||||
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
|
||||
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
|
||||
void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
|
||||
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
|
||||
|
||||
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
|
||||
PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
|
||||
}
|
||||
|
||||
cv::gpu::PyrLKOpticalFlow::PyrLKOpticalFlow()
{
    winSize = Size(21, 21);
    maxLevel = 3;
    iters = 30;
    useInitialFlow = false;
}

namespace
{
    void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch)
    {
        if (winSize.width > 32 && winSize.width > 2 * winSize.height)
        {
            block.x = deviceSupports(FEATURE_SET_COMPUTE_12) ? 32 : 16;
            block.y = 8;
        }
        else
        {
            block.x = 16;
            block.y = deviceSupports(FEATURE_SET_COMPUTE_12) ? 16 : 8;
        }

        patch.x = (winSize.width + block.x - 1) / block.x;
        patch.y = (winSize.height + block.y - 1) / block.y;

        block.z = patch.z = 1;
    }
}

void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err)
{
    if (prevPts.empty())
    {
        nextPts.release();
        status.release();
        if (err) err->release();
        return;
    }

    dim3 block, patch;
    calcPatchSize(winSize, block, patch);

    CV_Assert(prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4);
    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
    CV_Assert(maxLevel >= 0);
    CV_Assert(winSize.width > 2 && winSize.height > 2);
    CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
    CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);

    if (useInitialFlow)
        CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2);
    else
        ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);

    GpuMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
    GpuMat temp2 = nextPts.reshape(1);
    multiply(temp1, Scalar::all(1.0 / (1 << maxLevel) / 2.0), temp2);

    ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
    status.setTo(Scalar::all(1));

    if (err)
        ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);

    // build the image pyramids.
    prevPyr_.resize(maxLevel + 1);
    nextPyr_.resize(maxLevel + 1);

    int cn = prevImg.channels();

    if (cn == 1 || cn == 4)
    {
        prevImg.convertTo(prevPyr_[0], CV_32F);
        nextImg.convertTo(nextPyr_[0], CV_32F);
    }
    else
    {
        cvtColor(prevImg, buf_, COLOR_BGR2BGRA);
        buf_.convertTo(prevPyr_[0], CV_32F);

        cvtColor(nextImg, buf_, COLOR_BGR2BGRA);
        buf_.convertTo(nextPyr_[0], CV_32F);
    }

    for (int level = 1; level <= maxLevel; ++level)
    {
        pyrDown(prevPyr_[level - 1], prevPyr_[level]);
        pyrDown(nextPyr_[level - 1], nextPyr_[level]);
    }

    pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters);

    for (int level = maxLevel; level >= 0; level--)
    {
        if (cn == 1)
        {
            pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
                           prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
                           level, block, patch);
        }
        else
        {
            pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
                           prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
                           level, block, patch);
        }
    }
}

void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err)
{
    CV_Assert(prevImg.type() == CV_8UC1);
    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
    CV_Assert(maxLevel >= 0);
    CV_Assert(winSize.width > 2 && winSize.height > 2);

    if (err)
        err->create(prevImg.size(), CV_32FC1);

    // build the image pyramids.
    prevPyr_.resize(maxLevel + 1);
    nextPyr_.resize(maxLevel + 1);

    prevPyr_[0] = prevImg;
    nextImg.convertTo(nextPyr_[0], CV_32F);

    for (int level = 1; level <= maxLevel; ++level)
    {
        pyrDown(prevPyr_[level - 1], prevPyr_[level]);
        pyrDown(nextPyr_[level - 1], nextPyr_[level]);
    }

    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);
    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
    uPyr_[0].setTo(Scalar::all(0));
    vPyr_[0].setTo(Scalar::all(0));
    uPyr_[1].setTo(Scalar::all(0));
    vPyr_[1].setTo(Scalar::all(0));

    int2 winSize2i = make_int2(winSize.width, winSize.height);
    pyrlk::loadConstants(winSize2i, iters);

    PtrStepSzf derr = err ? *err : PtrStepSzf();

    int idx = 0;

    for (int level = maxLevel; level >= 0; level--)
    {
        int idx2 = (idx + 1) & 1;

        pyrlk::dense(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
                     level == 0 ? derr : PtrStepSzf(), winSize2i);

        if (level > 0)
            idx = idx2;
    }

    uPyr_[idx].copyTo(u);
    vPyr_[idx].copyTo(v);
}

void cv::gpu::PyrLKOpticalFlow::releaseMemory()
{
    prevPyr_.clear();
    nextPyr_.clear();

    buf_.release();

    uPyr_[0].release();
    vPyr_[0].release();

    uPyr_[1].release();
    vPyr_[1].release();
}

#endif /* !defined (HAVE_CUDA) */
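
Similarly, a minimal sparse-tracking sketch for the PyrLKOpticalFlow class above. This is not part of the original file; prev and next are assumed 8-bit host images, and d_prevPts is assumed to already hold the points to track as a 1xN CV_32FC2 GpuMat (e.g. uploaded from a host corner detector), the layout sparse() asserts.

    cv::gpu::GpuMat d_prev(prev), d_next(next);
    cv::gpu::GpuMat d_prevPts;            // 1 x N, CV_32FC2 -- filled elsewhere with the points to track
    cv::gpu::GpuMat d_nextPts, d_status, d_err;

    cv::gpu::PyrLKOpticalFlow lk;
    lk.winSize  = cv::Size(21, 21);       // the defaults set by the constructor above
    lk.maxLevel = 3;
    lk.iters    = 30;

    lk.sparse(d_prev, d_next, d_prevPts, d_nextPts, d_status, &d_err);
    // d_status holds 1 for points that were tracked successfully, d_nextPts their new positions.
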
@@ -1,258 +0,0 @@
/* (standard OpenCV BSD license header, identical to the copy reproduced above) */

#include "precomp.hpp"
|
||||
|
||||
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
|
||||
|
||||
cv::gpu::OpticalFlowDual_TVL1_GPU::OpticalFlowDual_TVL1_GPU() { throw_no_cuda(); }
|
||||
void cv::gpu::OpticalFlowDual_TVL1_GPU::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
|
||||
void cv::gpu::OpticalFlowDual_TVL1_GPU::collectGarbage() {}
|
||||
void cv::gpu::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
|
||||
|
||||
#else
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::gpu;
|
||||
|
||||
cv::gpu::OpticalFlowDual_TVL1_GPU::OpticalFlowDual_TVL1_GPU()
|
||||
{
|
||||
tau = 0.25;
|
||||
lambda = 0.15;
|
||||
theta = 0.3;
|
||||
nscales = 5;
|
||||
warps = 5;
|
||||
epsilon = 0.01;
|
||||
iterations = 300;
|
||||
scaleStep = 0.8;
|
||||
useInitialFlow = false;
|
||||
}
|
||||
|
||||
void cv::gpu::OpticalFlowDual_TVL1_GPU::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy)
{
    CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
    CV_Assert( I0.size() == I1.size() );
    CV_Assert( I0.type() == I1.type() );
    CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
    CV_Assert( nscales > 0 );

    // allocate memory for the pyramid structure
    I0s.resize(nscales);
    I1s.resize(nscales);
    u1s.resize(nscales);
    u2s.resize(nscales);

    I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0);
    I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);

    if (!useInitialFlow)
    {
        flowx.create(I0.size(), CV_32FC1);
        flowy.create(I0.size(), CV_32FC1);
    }

    u1s[0] = flowx;
    u2s[0] = flowy;

    I1x_buf.create(I0.size(), CV_32FC1);
    I1y_buf.create(I0.size(), CV_32FC1);

    I1w_buf.create(I0.size(), CV_32FC1);
    I1wx_buf.create(I0.size(), CV_32FC1);
    I1wy_buf.create(I0.size(), CV_32FC1);

    grad_buf.create(I0.size(), CV_32FC1);
    rho_c_buf.create(I0.size(), CV_32FC1);

    p11_buf.create(I0.size(), CV_32FC1);
    p12_buf.create(I0.size(), CV_32FC1);
    p21_buf.create(I0.size(), CV_32FC1);
    p22_buf.create(I0.size(), CV_32FC1);

    diff_buf.create(I0.size(), CV_32FC1);

    // create the scales
    for (int s = 1; s < nscales; ++s)
    {
        gpu::resize(I0s[s-1], I0s[s], Size(), scaleStep, scaleStep);
        gpu::resize(I1s[s-1], I1s[s], Size(), scaleStep, scaleStep);

        if (I0s[s].cols < 16 || I0s[s].rows < 16)
        {
            nscales = s;
            break;
        }

        if (useInitialFlow)
        {
            gpu::resize(u1s[s-1], u1s[s], Size(), scaleStep, scaleStep);
            gpu::resize(u2s[s-1], u2s[s], Size(), scaleStep, scaleStep);

            gpu::multiply(u1s[s], Scalar::all(scaleStep), u1s[s]);
            gpu::multiply(u2s[s], Scalar::all(scaleStep), u2s[s]);
        }
        else
        {
            u1s[s].create(I0s[s].size(), CV_32FC1);
            u2s[s].create(I0s[s].size(), CV_32FC1);
        }
    }

    if (!useInitialFlow)
    {
        u1s[nscales-1].setTo(Scalar::all(0));
        u2s[nscales-1].setTo(Scalar::all(0));
    }

    // pyramidal structure for computing the optical flow
    for (int s = nscales - 1; s >= 0; --s)
    {
        // compute the optical flow at the current scale
        procOneScale(I0s[s], I1s[s], u1s[s], u2s[s]);

        // if this was the last scale, finish now
        if (s == 0)
            break;

        // otherwise, upsample the optical flow

        // zoom the optical flow for the next finer scale
        gpu::resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
        gpu::resize(u2s[s], u2s[s - 1], I0s[s - 1].size());

        // scale the optical flow with the appropriate zoom factor
        gpu::multiply(u1s[s - 1], Scalar::all(1/scaleStep), u1s[s - 1]);
        gpu::multiply(u2s[s - 1], Scalar::all(1/scaleStep), u2s[s - 1]);
    }
}

namespace tvl1flow
{
    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy);
    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho);
    void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
                   PtrStepSzf grad, PtrStepSzf rho_c,
                   PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22,
                   PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error,
                   float l_t, float theta);
    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, float taut);
}

void cv::gpu::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2)
{
    using namespace tvl1flow;

    const double scaledEpsilon = epsilon * epsilon * I0.size().area();

    CV_DbgAssert( I1.size() == I0.size() );
    CV_DbgAssert( I1.type() == I0.type() );
    CV_DbgAssert( u1.size() == I0.size() );
    CV_DbgAssert( u2.size() == u1.size() );

    GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
    centeredGradient(I1, I1x, I1y);

    GpuMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));

    GpuMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));

    GpuMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
    p11.setTo(Scalar::all(0));
    p12.setTo(Scalar::all(0));
    p21.setTo(Scalar::all(0));
    p22.setTo(Scalar::all(0));

    GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));

    const float l_t = static_cast<float>(lambda * theta);
    const float taut = static_cast<float>(tau / theta);

    for (int warpings = 0; warpings < warps; ++warpings)
    {
        warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);

        double error = std::numeric_limits<double>::max();
        for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
        {
            estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, diff, l_t, static_cast<float>(theta));

            error = gpu::sum(diff, norm_buf)[0];

            estimateDualVariables(u1, u2, p11, p12, p21, p22, taut);
        }
    }
}

void cv::gpu::OpticalFlowDual_TVL1_GPU::collectGarbage()
{
    I0s.clear();
    I1s.clear();
    u1s.clear();
    u2s.clear();

    I1x_buf.release();
    I1y_buf.release();

    I1w_buf.release();
    I1wx_buf.release();
    I1wy_buf.release();

    grad_buf.release();
    rho_c_buf.release();

    p11_buf.release();
    p12_buf.release();
    p21_buf.release();
    p22_buf.release();

    diff_buf.release();
    norm_buf.release();
}

#endif // !defined HAVE_CUDA || defined(CUDA_DISABLER)
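
Finally, a minimal sketch for the dual TV-L1 solver above. This is not part of the original file; I0 and I1 are assumed to be hypothetical same-size CV_8UC1 or CV_32FC1 host frames, matching the asserts in operator().

    cv::gpu::GpuMat d_I0(I0), d_I1(I1), d_flowx, d_flowy;

    cv::gpu::OpticalFlowDual_TVL1_GPU tvl1;   // tau, lambda, theta, nscales, warps, ... keep the constructor defaults
    tvl1(d_I0, d_I1, d_flowx, d_flowy);       // CV_32FC1 flow components

    tvl1.collectGarbage();                    // optionally release the internal pyramids and buffers
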