gpuvideo module for video processing

This commit is contained in:
Vladislav Vinogradov
2013-04-10 10:59:25 +04:00
parent 7544ddbfef
commit fc1fa28556
41 changed files with 1033 additions and 525 deletions

View File

@@ -4,7 +4,9 @@ endif()
set(the_description "GPU-accelerated Computer Vision")
ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy opencv_gpuarithm opencv_gpufilters opencv_gpuimgproc opencv_gpufeatures2d OPTIONAL opencv_gpunvidia)
ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy
opencv_gpuarithm opencv_gpufilters opencv_gpuimgproc opencv_gpufeatures2d opencv_gpuvideo
OPTIONAL opencv_gpunvidia)
ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")

View File

@@ -10,4 +10,3 @@ gpu. GPU-accelerated Computer Vision
data_structures
object_detection
camera_calibration_and_3d_reconstruction
video

View File

@@ -1,695 +0,0 @@
Video Analysis
==============
.. highlight:: cpp
gpu::BroxOpticalFlow
--------------------
.. ocv:class:: gpu::BroxOpticalFlow
Class computing the optical flow for two images using the Brox et al. optical flow algorithm ([Brox2004]_). ::
class BroxOpticalFlow
{
public:
BroxOpticalFlow(float alpha_, float gamma_, float scale_factor_, int inner_iterations_, int outer_iterations_, int solver_iterations_);
//! Compute optical flow
//! frame0 - source frame (supports only CV_32FC1 type)
//! frame1 - frame to track (with the same size and type as frame0)
//! u - flow horizontal component (along x axis)
//! v - flow vertical component (along y axis)
void operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& stream = Stream::Null());
//! flow smoothness
float alpha;
//! gradient constancy importance
float gamma;
//! pyramid scale factor
float scale_factor;
//! number of lagged non-linearity iterations (inner loop)
int inner_iterations;
//! number of warping iterations (number of pyramid levels)
int outer_iterations;
//! number of linear system solver iterations
int solver_iterations;
GpuMat buf;
};
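A minimal usage sketch (illustrative only; the file names and parameter values are assumptions, not defaults of this module): ::

    cv::Mat frame0 = cv::imread("frame0.png", cv::IMREAD_GRAYSCALE);
    cv::Mat frame1 = cv::imread("frame1.png", cv::IMREAD_GRAYSCALE);

    // Brox optical flow expects CV_32FC1 input, here scaled to [0, 1]
    cv::Mat frame0f, frame1f;
    frame0.convertTo(frame0f, CV_32F, 1.0 / 255.0);
    frame1.convertTo(frame1f, CV_32F, 1.0 / 255.0);

    cv::gpu::GpuMat d_frame0(frame0f), d_frame1(frame1f);
    cv::gpu::GpuMat d_u, d_v;

    cv::gpu::BroxOpticalFlow brox(0.197f, 50.0f, 0.8f, 10, 77, 10);
    brox(d_frame0, d_frame1, d_u, d_v);

    cv::Mat u, v;
    d_u.download(u); // horizontal flow component
    d_v.download(v); // vertical flow component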
gpu::GoodFeaturesToTrackDetector_GPU
------------------------------------
.. ocv:class:: gpu::GoodFeaturesToTrackDetector_GPU
Class used for strong corners detection on an image. ::
class GoodFeaturesToTrackDetector_GPU
{
public:
explicit GoodFeaturesToTrackDetector_GPU(int maxCorners_ = 1000, double qualityLevel_ = 0.01, double minDistance_ = 0.0,
int blockSize_ = 3, bool useHarrisDetector_ = false, double harrisK_ = 0.04);
void operator ()(const GpuMat& image, GpuMat& corners, const GpuMat& mask = GpuMat());
int maxCorners;
double qualityLevel;
double minDistance;
int blockSize;
bool useHarrisDetector;
double harrisK;
void releaseMemory();
};
The class finds the most prominent corners in the image.
.. seealso:: :ocv:func:`goodFeaturesToTrack`
gpu::GoodFeaturesToTrackDetector_GPU::GoodFeaturesToTrackDetector_GPU
---------------------------------------------------------------------
Constructor.
.. ocv:function:: gpu::GoodFeaturesToTrackDetector_GPU::GoodFeaturesToTrackDetector_GPU(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0, int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04)
:param maxCorners: Maximum number of corners to return. If more corners than ``maxCorners`` are found, the strongest of them are returned.
:param qualityLevel: Parameter characterizing the minimal accepted quality of image corners. The parameter value is multiplied by the best corner quality measure, which is the minimal eigenvalue (see :ocv:func:`gpu::cornerMinEigenVal` ) or the Harris function response (see :ocv:func:`gpu::cornerHarris` ). The corners with the quality measure less than the product are rejected. For example, if the best corner has the quality measure = 1500, and the ``qualityLevel=0.01`` , then all the corners with the quality measure less than 15 are rejected.
:param minDistance: Minimum possible Euclidean distance between the returned corners.
:param blockSize: Size of an average block for computing a derivative covariation matrix over each pixel neighborhood. See :ocv:func:`cornerEigenValsAndVecs` .
:param useHarrisDetector: Parameter indicating whether to use a Harris detector (see :ocv:func:`gpu::cornerHarris`) or :ocv:func:`gpu::cornerMinEigenVal`.
:param harrisK: Free parameter of the Harris detector.
gpu::GoodFeaturesToTrackDetector_GPU::operator ()
-------------------------------------------------
Finds the most prominent corners in the image.
.. ocv:function:: void gpu::GoodFeaturesToTrackDetector_GPU::operator ()(const GpuMat& image, GpuMat& corners, const GpuMat& mask = GpuMat())
:param image: Input 8-bit, single-channel image.
:param corners: Output vector of detected corners (it will be a one-row matrix of ``CV_32FC2`` type).
:param mask: Optional region of interest. If the image is not empty (it needs to have the type ``CV_8UC1`` and the same size as ``image`` ), it specifies the region in which the corners are detected.
.. seealso:: :ocv:func:`goodFeaturesToTrack`
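A minimal usage sketch (illustrative; ``frame.png`` is a placeholder): ::

    cv::Mat img = cv::imread("frame.png", cv::IMREAD_GRAYSCALE);
    cv::gpu::GpuMat d_img(img);

    cv::gpu::GoodFeaturesToTrackDetector_GPU detector(4000, 0.01, 10.0);
    cv::gpu::GpuMat d_corners;
    detector(d_img, d_corners); // d_corners is a 1 x N matrix of CV_32FC2

    // copy the detected corners back to the host
    std::vector<cv::Point2f> corners;
    if (!d_corners.empty())
    {
        corners.resize(d_corners.cols);
        cv::Mat corners_host(1, d_corners.cols, CV_32FC2, (void*)&corners[0]);
        d_corners.download(corners_host);
    }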
gpu::GoodFeaturesToTrackDetector_GPU::releaseMemory
---------------------------------------------------
Releases inner buffers' memory.
.. ocv:function:: void gpu::GoodFeaturesToTrackDetector_GPU::releaseMemory()
gpu::FarnebackOpticalFlow
-------------------------
.. ocv:class:: gpu::FarnebackOpticalFlow
Class computing a dense optical flow using Gunnar Farneback's algorithm. ::
class CV_EXPORTS FarnebackOpticalFlow
{
public:
FarnebackOpticalFlow()
{
numLevels = 5;
pyrScale = 0.5;
fastPyramids = false;
winSize = 13;
numIters = 10;
polyN = 5;
polySigma = 1.1;
flags = 0;
}
int numLevels;
double pyrScale;
bool fastPyramids;
int winSize;
int numIters;
int polyN;
double polySigma;
int flags;
void operator ()(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s = Stream::Null());
void releaseMemory();
private:
/* hidden */
};
gpu::FarnebackOpticalFlow::operator ()
--------------------------------------
Computes a dense optical flow using Gunnar Farneback's algorithm.
.. ocv:function:: void gpu::FarnebackOpticalFlow::operator ()(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s = Stream::Null())
:param frame0: First 8-bit gray-scale input image
:param frame1: Second 8-bit gray-scale input image
:param flowx: Flow horizontal component
:param flowy: Flow vertical component
:param s: Stream
.. seealso:: :ocv:func:`calcOpticalFlowFarneback`
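A minimal usage sketch (illustrative; ``frame0`` and ``frame1`` are assumed to be 8-bit grayscale ``cv::Mat`` images already loaded): ::

    cv::gpu::FarnebackOpticalFlow farneback;
    farneback.numLevels = 5;
    farneback.pyrScale  = 0.5;
    farneback.winSize   = 13;
    farneback.numIters  = 10;

    cv::gpu::GpuMat d_frame0(frame0), d_frame1(frame1);
    cv::gpu::GpuMat d_flowx, d_flowy;
    farneback(d_frame0, d_frame1, d_flowx, d_flowy);

    cv::Mat flowx, flowy;
    d_flowx.download(flowx);
    d_flowy.download(flowy);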
gpu::FarnebackOpticalFlow::releaseMemory
----------------------------------------
Releases unused auxiliary memory buffers.
.. ocv:function:: void gpu::FarnebackOpticalFlow::releaseMemory()
gpu::PyrLKOpticalFlow
---------------------
.. ocv:class:: gpu::PyrLKOpticalFlow
Class used for calculating an optical flow. ::
class PyrLKOpticalFlow
{
public:
PyrLKOpticalFlow();
void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
GpuMat& status, GpuMat* err = 0);
void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err = 0);
Size winSize;
int maxLevel;
int iters;
bool useInitialFlow;
void releaseMemory();
};
The class can calculate an optical flow for a sparse feature set or dense optical flow using the iterative Lucas-Kanade method with pyramids.
.. seealso:: :ocv:func:`calcOpticalFlowPyrLK`
gpu::PyrLKOpticalFlow::sparse
-----------------------------
Calculate an optical flow for a sparse feature set.
.. ocv:function:: void gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err = 0)
:param prevImg: First 8-bit input image (supports both grayscale and color images).
:param nextImg: Second input image of the same size and the same type as ``prevImg`` .
:param prevPts: Vector of 2D points for which the flow needs to be found. It must be a one-row matrix of ``CV_32FC2`` type.
:param nextPts: Output vector of 2D points (with single-precision floating-point coordinates) containing the calculated new positions of input features in the second image. When ``useInitialFlow`` is true, the vector must have the same size as in the input.
:param status: Output status vector (CV_8UC1 type). Each element of the vector is set to 1 if the flow for the corresponding features has been found. Otherwise, it is set to 0.
:param err: Output vector (CV_32FC1 type) that contains the difference between patches around the original and moved points or min eigen value if ``getMinEigenVals`` is checked. It can be NULL, if not needed.
.. seealso:: :ocv:func:`calcOpticalFlowPyrLK`
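A minimal usage sketch (illustrative; ``d_prevImg`` and ``d_nextImg`` are assumed to be 8-bit single-channel ``GpuMat`` frames already uploaded): ::

    // detect points to track in the first frame
    cv::gpu::GoodFeaturesToTrackDetector_GPU detector(4000, 0.01, 0.0);
    cv::gpu::GpuMat d_prevPts;
    detector(d_prevImg, d_prevPts);

    cv::gpu::PyrLKOpticalFlow lk;
    lk.winSize  = cv::Size(21, 21);
    lk.maxLevel = 3;
    lk.iters    = 30;

    cv::gpu::GpuMat d_nextPts, d_status;
    lk.sparse(d_prevImg, d_nextImg, d_prevPts, d_nextPts, d_status);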
gpu::PyrLKOpticalFlow::dense
-----------------------------
Calculate dense optical flow.
.. ocv:function:: void gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err = 0)
:param prevImg: First 8-bit grayscale input image.
:param nextImg: Second input image of the same size and the same type as ``prevImg`` .
:param u: Horizontal component of the optical flow of the same size as input images, 32-bit floating-point, single-channel
:param v: Vertical component of the optical flow of the same size as input images, 32-bit floating-point, single-channel
:param err: Output vector (CV_32FC1 type) that contains the difference between patches around the original and moved points or min eigen value if ``getMinEigenVals`` is checked. It can be NULL, if not needed.
gpu::PyrLKOpticalFlow::releaseMemory
------------------------------------
Releases inner buffers' memory.
.. ocv:function:: void gpu::PyrLKOpticalFlow::releaseMemory()
gpu::interpolateFrames
----------------------
Interpolates frames (images) using provided optical flow (displacement field).
.. ocv:function:: void gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, const GpuMat& fu, const GpuMat& fv, const GpuMat& bu, const GpuMat& bv, float pos, GpuMat& newFrame, GpuMat& buf, Stream& stream = Stream::Null())
:param frame0: First frame (32-bit floating point images, single channel).
:param frame1: Second frame. Must have the same type and size as ``frame0`` .
:param fu: Forward horizontal displacement.
:param fv: Forward vertical displacement.
:param bu: Backward horizontal displacement.
:param bv: Backward vertical displacement.
:param pos: New frame position.
:param newFrame: Output image.
:param buf: Temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 GpuMat: occlusion masks for first frame, occlusion masks for second, interpolated forward horizontal flow, interpolated forward vertical flow, interpolated backward horizontal flow, interpolated backward vertical flow.
:param stream: Stream for the asynchronous version.
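A minimal sketch of synthesizing an intermediate frame half-way between two ``CV_32FC1`` frames ``d_frame0`` and ``d_frame1`` (illustrative; the Brox parameter values are assumptions): ::

    cv::gpu::BroxOpticalFlow flow(0.197f, 50.0f, 0.8f, 10, 77, 10);

    cv::gpu::GpuMat d_fu, d_fv, d_bu, d_bv;
    flow(d_frame0, d_frame1, d_fu, d_fv); // forward flow
    flow(d_frame1, d_frame0, d_bu, d_bv); // backward flow

    cv::gpu::GpuMat d_newFrame, d_buf;
    cv::gpu::interpolateFrames(d_frame0, d_frame1, d_fu, d_fv, d_bu, d_bv,
                               0.5f, d_newFrame, d_buf); // pos = 0.5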
gpu::FGDStatModel
-----------------
.. ocv:class:: gpu::FGDStatModel
Class used for background/foreground segmentation. ::
class FGDStatModel
{
public:
struct Params
{
...
};
explicit FGDStatModel(int out_cn = 3);
explicit FGDStatModel(const cv::gpu::GpuMat& firstFrame, const Params& params = Params(), int out_cn = 3);
~FGDStatModel();
void create(const cv::gpu::GpuMat& firstFrame, const Params& params = Params());
void release();
int update(const cv::gpu::GpuMat& curFrame);
//8UC3 or 8UC4 reference background image
cv::gpu::GpuMat background;
//8UC1 foreground image
cv::gpu::GpuMat foreground;
std::vector< std::vector<cv::Point> > foreground_regions;
};
The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements the algorithm described in [FGD2003]_.
The results are available through the class fields:
.. ocv:member:: cv::gpu::GpuMat background
The output background image.
.. ocv:member:: cv::gpu::GpuMat foreground
The output foreground mask as an 8-bit binary image.
.. ocv:member:: cv::gpu::GpuMat foreground_regions
The output foreground regions calculated by :ocv:func:`findContours`.
gpu::FGDStatModel::FGDStatModel
-------------------------------
Constructors.
.. ocv:function:: gpu::FGDStatModel::FGDStatModel(int out_cn = 3)
.. ocv:function:: gpu::FGDStatModel::FGDStatModel(const cv::gpu::GpuMat& firstFrame, const Params& params = Params(), int out_cn = 3)
:param firstFrame: First frame from video stream. Supports 3- and 4-channels input ( ``CV_8UC3`` and ``CV_8UC4`` ).
:param params: Algorithm's parameters. See [FGD2003]_ for explanation.
:param out_cn: Number of channels in the output result and inner buffers. Can be 3 or 4. The 4-channel version requires more memory, but works a bit faster.
.. seealso:: :ocv:func:`gpu::FGDStatModel::create`
gpu::FGDStatModel::create
-------------------------
Initializes background model.
.. ocv:function:: void gpu::FGDStatModel::create(const cv::gpu::GpuMat& firstFrame, const Params& params = Params())
:param firstFrame: First frame from video stream. Supports 3- and 4-channels input ( ``CV_8UC3`` and ``CV_8UC4`` ).
:param params: Algorithm's parameters. See [FGD2003]_ for explanation.
gpu::FGDStatModel::release
--------------------------
Releases all inner buffers' memory.
.. ocv:function:: void gpu::FGDStatModel::release()
gpu::FGDStatModel::update
--------------------------
Updates the background model and returns the number of foreground regions.
.. ocv:function:: int gpu::FGDStatModel::update(const cv::gpu::GpuMat& curFrame)
:param curFrame: Next video frame.
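A minimal processing loop (illustrative; ``video.avi`` is a placeholder): ::

    cv::VideoCapture cap("video.avi");
    cv::Mat frame;
    cap >> frame;

    cv::gpu::GpuMat d_frame(frame);
    cv::gpu::FGDStatModel fgd(d_frame); // the model is created from the first frame

    for (;;)
    {
        cap >> frame;
        if (frame.empty())
            break;
        d_frame.upload(frame);

        fgd.update(d_frame); // returns the number of foreground regions

        cv::Mat fgmask, bgimg;
        fgd.foreground.download(fgmask); // 8UC1 foreground mask
        fgd.background.download(bgimg);  // current background estimate
    }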
gpu::MOG_GPU
------------
.. ocv:class:: gpu::MOG_GPU
Gaussian Mixture-based Background/Foreground Segmentation Algorithm. ::
class MOG_GPU
{
public:
MOG_GPU(int nmixtures = -1);
void initialize(Size frameSize, int frameType);
void operator()(const GpuMat& frame, GpuMat& fgmask, float learningRate = 0.0f, Stream& stream = Stream::Null());
void getBackgroundImage(GpuMat& backgroundImage, Stream& stream = Stream::Null()) const;
void release();
int history;
float varThreshold;
float backgroundRatio;
float noiseSigma;
};
The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements the algorithm described in [MOG2001]_.
.. seealso:: :ocv:class:`BackgroundSubtractorMOG`
gpu::MOG_GPU::MOG_GPU
---------------------
The constructor.
.. ocv:function:: gpu::MOG_GPU::MOG_GPU(int nmixtures = -1)
:param nmixtures: Number of Gaussian mixtures.
Default constructor sets all parameters to default values.
gpu::MOG_GPU::operator()
------------------------
Updates the background model and returns the foreground mask.
.. ocv:function:: void gpu::MOG_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, float learningRate = 0.0f, Stream& stream = Stream::Null())
:param frame: Next video frame.
:param fgmask: The output foreground mask as an 8-bit binary image.
:param stream: Stream for the asynchronous version.
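A minimal processing loop (illustrative; ``cap`` is assumed to be an opened ``cv::VideoCapture``, and the learning rate of 0.01 is only an example): ::

    cv::gpu::MOG_GPU mog;
    cv::gpu::GpuMat d_frame, d_fgmask;
    cv::Mat frame, fgmask;

    for (;;)
    {
        cap >> frame;
        if (frame.empty())
            break;
        d_frame.upload(frame);

        mog(d_frame, d_fgmask, 0.01f);
        d_fgmask.download(fgmask);
    }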
gpu::MOG_GPU::getBackgroundImage
--------------------------------
Computes a background image.
.. ocv:function:: void gpu::MOG_GPU::getBackgroundImage(GpuMat& backgroundImage, Stream& stream = Stream::Null()) const
:param backgroundImage: The output background image.
:param stream: Stream for the asynchronous version.
gpu::MOG_GPU::release
---------------------
Releases all inner buffers' memory.
.. ocv:function:: void gpu::MOG_GPU::release()
gpu::MOG2_GPU
-------------
.. ocv:class:: gpu::MOG2_GPU
Gaussian Mixture-based Background/Foreground Segmentation Algorithm. ::
class MOG2_GPU
{
public:
MOG2_GPU(int nmixtures = -1);
void initialize(Size frameSize, int frameType);
void operator()(const GpuMat& frame, GpuMat& fgmask, float learningRate = 0.0f, Stream& stream = Stream::Null());
void getBackgroundImage(GpuMat& backgroundImage, Stream& stream = Stream::Null()) const;
void release();
// parameters
...
};
The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements the algorithm described in [MOG2004]_.
Here are important members of the class that control the algorithm, which you can set after constructing the class instance:
.. ocv:member:: float backgroundRatio
Threshold defining whether the component is significant enough to be included into the background model (corresponds to ``TB=1-cf`` in [MOG2004]_). ``cf=0.1 => TB=0.9`` is the default. For ``alpha=0.001``, it means that the mode should exist for approximately 105 frames before it is considered foreground.
.. ocv:member:: float varThreshold
Threshold for the squared Mahalanobis distance that helps decide when a sample is close to the existing components (corresponds to ``Tg``). If it is not close to any component, a new component is generated. ``3 sigma => Tg=3*3=9`` is default. A smaller ``Tg`` value generates more components. A higher ``Tg`` value may result in a small number of components but they can grow too large.
.. ocv:member:: float fVarInit
Initial variance for the newly generated components. It affects the speed of adaptation. The parameter value is based on your estimate of the typical standard deviation from the images. OpenCV uses 15 as a reasonable value.
.. ocv:member:: float fVarMin
Parameter used to further control the variance.
.. ocv:member:: float fVarMax
Parameter used to further control the variance.
.. ocv:member:: float fCT
Complexity reduction parameter. This parameter defines the number of samples needed to prove that the component exists. ``CT=0.05`` is a default value for all the samples. By setting ``CT=0`` you get an algorithm very similar to the standard Stauffer&Grimson algorithm.
.. ocv:member:: uchar nShadowDetection
The value for marking shadow pixels in the output foreground mask. Default value is 127.
.. ocv:member:: float fTau
Shadow threshold. The shadow is detected if the pixel is a darker version of the background. ``Tau`` is a threshold defining how much darker the shadow can be. ``Tau=0.5`` means that if a pixel is more than twice darker, it is not a shadow. See [ShadowDetect2003]_.
.. ocv:member:: bool bShadowDetection
Parameter defining whether shadow detection should be enabled.
.. seealso:: :ocv:class:`BackgroundSubtractorMOG2`
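A minimal sketch of tuning these members and running the model on an already uploaded frame ``d_frame`` (illustrative values): ::

    cv::gpu::MOG2_GPU mog2;
    mog2.bShadowDetection = true;  // mark shadows in the foreground mask
    mog2.nShadowDetection = 127;   // value used for shadow pixels
    mog2.varThreshold     = 16.0f;

    cv::gpu::GpuMat d_fgmask, d_bgimg;
    mog2(d_frame, d_fgmask);          // default learning rate (-1.0f)
    mog2.getBackgroundImage(d_bgimg); // current background estimate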
gpu::MOG2_GPU::MOG2_GPU
-----------------------
The constructor.
.. ocv:function:: gpu::MOG2_GPU::MOG2_GPU(int nmixtures = -1)
:param nmixtures: Number of Gaussian mixtures.
Default constructor sets all parameters to default values.
gpu::MOG2_GPU::operator()
-------------------------
Updates the background model and returns the foreground mask.
.. ocv:function:: void gpu::MOG2_GPU::operator()( const GpuMat& frame, GpuMat& fgmask, float learningRate=-1.0f, Stream& stream=Stream::Null() )
:param frame: Next video frame.
:param fgmask: The output foreground mask as an 8-bit binary image.
:param stream: Stream for the asynchronous version.
gpu::MOG2_GPU::getBackgroundImage
---------------------------------
Computes a background image.
.. ocv:function:: void gpu::MOG2_GPU::getBackgroundImage(GpuMat& backgroundImage, Stream& stream = Stream::Null()) const
:param backgroundImage: The output background image.
:param stream: Stream for the asynchronous version.
gpu::MOG2_GPU::release
----------------------
Releases all inner buffers' memory.
.. ocv:function:: void gpu::MOG2_GPU::release()
gpu::GMG_GPU
------------
.. ocv:class:: gpu::GMG_GPU
Class used for background/foreground segmentation. ::
class GMG_GPU
{
public:
GMG_GPU();
void initialize(Size frameSize, float min = 0.0f, float max = 255.0f);
void operator ()(const GpuMat& frame, GpuMat& fgmask, float learningRate = -1.0f, Stream& stream = Stream::Null());
void release();
int maxFeatures;
float learningRate;
int numInitializationFrames;
int quantizationLevels;
float backgroundPrior;
float decisionThreshold;
int smoothingRadius;
...
};
The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements the algorithm described in [GMG2012]_.
Here are important members of the class that control the algorithm, which you can set after constructing the class instance:
.. ocv:member:: int maxFeatures
Total number of distinct colors to maintain in histogram.
.. ocv:member:: float learningRate
Set between 0.0 and 1.0, determines how quickly features are "forgotten" from histograms.
.. ocv:member:: int numInitializationFrames
Number of frames of video to use to initialize histograms.
.. ocv:member:: int quantizationLevels
Number of discrete levels in each channel to be used in histograms.
.. ocv:member:: float backgroundPrior
Prior probability that any given pixel is a background pixel. A sensitivity parameter.
.. ocv:member:: float decisionThreshold
Value above which pixel is determined to be FG.
.. ocv:member:: int smoothingRadius
Smoothing radius, in pixels, for cleaning up FG image.
gpu::GMG_GPU::GMG_GPU
---------------------
The default constructor.
.. ocv:function:: gpu::GMG_GPU::GMG_GPU()
Default constructor sets all parameters to default values.
gpu::GMG_GPU::initialize
------------------------
Initializes the background model and allocates all inner buffers.
.. ocv:function:: void gpu::GMG_GPU::initialize(Size frameSize, float min = 0.0f, float max = 255.0f)
:param frameSize: Input frame size.
:param min: Minimum value taken on by pixels in image sequence. Usually 0.
:param max: Maximum value taken on by pixels in image sequence, e.g. 1.0 or 255.
gpu::GMG_GPU::operator()
------------------------
Updates the background model and returns the foreground mask.
.. ocv:function:: void gpu::GMG_GPU::operator ()( const GpuMat& frame, GpuMat& fgmask, float learningRate=-1.0f, Stream& stream=Stream::Null() )
:param frame: Next video frame.
:param fgmask: The output foreground mask as an 8-bit binary image.
:param stream: Stream for the asynchronous version.
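A minimal processing loop (illustrative; ``cap`` is an opened ``cv::VideoCapture``, and the initialization-frame count is only an example): ::

    cv::gpu::GMG_GPU gmg;
    gmg.numInitializationFrames = 40; // mask stays empty while the model trains

    cv::gpu::GpuMat d_frame, d_fgmask;
    cv::Mat frame, fgmask;

    for (;;)
    {
        cap >> frame;
        if (frame.empty())
            break;
        d_frame.upload(frame);

        gmg(d_frame, d_fgmask);
        d_fgmask.download(fgmask);
    }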
gpu::GMG_GPU::release
---------------------
Releases all inner buffers' memory.
.. ocv:function:: void gpu::GMG_GPU::release()
.. [Brox2004] T. Brox, A. Bruhn, N. Papenberg, J. Weickert. *High accuracy optical flow estimation based on a theory for warping*. ECCV 2004.
.. [FGD2003] Liyuan Li, Weimin Huang, Irene Y.H. Gu, and Qi Tian. *Foreground Object Detection from Videos Containing Complex Background*. ACM MM2003 9p, 2003.
.. [MOG2001] P. KadewTraKuPong and R. Bowden. *An improved adaptive background mixture model for real-time tracking with shadow detection*. Proc. 2nd European Workshop on Advanced Video-Based Surveillance Systems, 2001
.. [MOG2004] Z. Zivkovic. *Improved adaptive Gaussian mixture model for background subtraction*. International Conference Pattern Recognition, UK, August, 2004
.. [ShadowDetect2003] Prati, Mikic, Trivedi and Cucchiara. *Detecting Moving Shadows...*. IEEE PAMI, 2003
.. [GMG2012] A. Godbehere, A. Matsukawa and K. Goldberg. *Visual Tracking of Human Visitors under Variable-Lighting Conditions for a Responsive Audio Art Installation*. American Control Conference, Montreal, June 2012

View File

@@ -54,6 +54,7 @@
#include "opencv2/gpufilters.hpp"
#include "opencv2/gpuimgproc.hpp"
#include "opencv2/gpufeatures2d.hpp"
#include "opencv2/gpuvideo.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/objdetect.hpp"
@@ -433,543 +434,23 @@ private:
////////////////////////////////// Optical Flow //////////////////////////////////////////
class CV_EXPORTS BroxOpticalFlow
{
public:
BroxOpticalFlow(float alpha_, float gamma_, float scale_factor_, int inner_iterations_, int outer_iterations_, int solver_iterations_) :
alpha(alpha_), gamma(gamma_), scale_factor(scale_factor_),
inner_iterations(inner_iterations_), outer_iterations(outer_iterations_), solver_iterations(solver_iterations_)
{
}
//! Compute optical flow
//! frame0 - source frame (supports only CV_32FC1 type)
//! frame1 - frame to track (with the same size and type as frame0)
//! u - flow horizontal component (along x axis)
//! v - flow vertical component (along y axis)
void operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& stream = Stream::Null());
//! flow smoothness
float alpha;
//! gradient constancy importance
float gamma;
//! pyramid scale factor
float scale_factor;
//! number of lagged non-linearity iterations (inner loop)
int inner_iterations;
//! number of warping iterations (number of pyramid levels)
int outer_iterations;
//! number of linear system solver iterations
int solver_iterations;
GpuMat buf;
};
class CV_EXPORTS PyrLKOpticalFlow
{
public:
PyrLKOpticalFlow();
void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
GpuMat& status, GpuMat* err = 0);
void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err = 0);
void releaseMemory();
Size winSize;
int maxLevel;
int iters;
bool useInitialFlow;
private:
std::vector<GpuMat> prevPyr_;
std::vector<GpuMat> nextPyr_;
GpuMat buf_;
GpuMat uPyr_[2];
GpuMat vPyr_[2];
};
class CV_EXPORTS FarnebackOpticalFlow
{
public:
FarnebackOpticalFlow()
{
numLevels = 5;
pyrScale = 0.5;
fastPyramids = false;
winSize = 13;
numIters = 10;
polyN = 5;
polySigma = 1.1;
flags = 0;
}
int numLevels;
double pyrScale;
bool fastPyramids;
int winSize;
int numIters;
int polyN;
double polySigma;
int flags;
void operator ()(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s = Stream::Null());
void releaseMemory()
{
frames_[0].release();
frames_[1].release();
pyrLevel_[0].release();
pyrLevel_[1].release();
M_.release();
bufM_.release();
R_[0].release();
R_[1].release();
blurredFrame_[0].release();
blurredFrame_[1].release();
pyramid0_.clear();
pyramid1_.clear();
}
private:
void prepareGaussian(
int n, double sigma, float *g, float *xg, float *xxg,
double &ig11, double &ig03, double &ig33, double &ig55);
void setPolynomialExpansionConsts(int n, double sigma);
void updateFlow_boxFilter(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
void updateFlow_gaussianBlur(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
GpuMat frames_[2];
GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
std::vector<GpuMat> pyramid0_, pyramid1_;
};
// Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
//
// see reference:
// [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
// [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
class CV_EXPORTS OpticalFlowDual_TVL1_GPU
{
public:
OpticalFlowDual_TVL1_GPU();
void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy);
void collectGarbage();
/**
* Time step of the numerical scheme.
*/
double tau;
/**
* Weight parameter for the data term, attachment parameter.
* This is the most relevant parameter, which determines the smoothness of the output.
* The smaller this parameter is, the smoother the solutions we obtain.
* It depends on the range of motions of the images, so its value should be adapted to each image sequence.
*/
double lambda;
/**
* Weight parameter for (u - v)^2, tightness parameter.
* It serves as a link between the attachment and the regularization terms.
* In theory, it should have a small value in order to maintain both parts in correspondence.
* The method is stable for a large range of values of this parameter.
*/
double theta;
/**
* Number of scales used to create the pyramid of images.
*/
int nscales;
/**
* Number of warpings per scale.
* Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale.
* This is a parameter that assures the stability of the method.
* It also affects the running time, so it is a compromise between speed and accuracy.
*/
int warps;
/**
* Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
* A small value will yield more accurate solutions at the expense of a slower convergence.
*/
double epsilon;
/**
* Stopping criterion iterations number used in the numerical scheme.
*/
int iterations;
double scaleStep;
bool useInitialFlow;
private:
void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2);
std::vector<GpuMat> I0s;
std::vector<GpuMat> I1s;
std::vector<GpuMat> u1s;
std::vector<GpuMat> u2s;
GpuMat I1x_buf;
GpuMat I1y_buf;
GpuMat I1w_buf;
GpuMat I1wx_buf;
GpuMat I1wy_buf;
GpuMat grad_buf;
GpuMat rho_c_buf;
GpuMat p11_buf;
GpuMat p12_buf;
GpuMat p21_buf;
GpuMat p22_buf;
GpuMat diff_buf;
GpuMat norm_buf;
};
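// Illustrative usage sketch (not part of this header); d_frame0 and d_frame1
// are assumed to be 8-bit single-channel GpuMat images already uploaded:
//
//     OpticalFlowDual_TVL1_GPU tvl1;
//     GpuMat d_flowx, d_flowy;
//     tvl1(d_frame0, d_frame1, d_flowx, d_flowy);
//     tvl1.collectGarbage(); // release internal buffers when no longer needed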
//! Calculates optical flow for 2 images using block matching algorithm
CV_EXPORTS void calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr,
Size block_size, Size shift_size, Size max_range, bool use_previous,
GpuMat& velx, GpuMat& vely, GpuMat& buf,
Stream& stream = Stream::Null());
class CV_EXPORTS FastOpticalFlowBM
{
public:
void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, int search_window = 21, int block_window = 7, Stream& s = Stream::Null());
private:
GpuMat buffer;
GpuMat extended_I0;
GpuMat extended_I1;
};
//! Interpolate frames (images) using provided optical flow (displacement field).
//! frame0 - frame 0 (32-bit floating point images, single channel)
//! frame1 - frame 1 (the same type and size)
//! fu - forward horizontal displacement
//! fv - forward vertical displacement
//! bu - backward horizontal displacement
//! bv - backward vertical displacement
//! pos - new frame position
//! newFrame - new frame
//! buf - temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 GpuMat;
//! occlusion masks 0, occlusion masks 1,
//! interpolated forward flow 0, interpolated forward flow 1,
//! interpolated backward flow 0, interpolated backward flow 1
//!
CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1,
const GpuMat& fu, const GpuMat& fv,
const GpuMat& bu, const GpuMat& bv,
float pos, GpuMat& newFrame, GpuMat& buf,
Stream& stream = Stream::Null());
CV_EXPORTS void createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMat& vertex, GpuMat& colors);
//////////////////////// Background/foreground segmentation ////////////////////////
// Foreground Object Detection from Videos Containing Complex Background.
// Liyuan Li, Weimin Huang, Irene Y.H. Gu, and Qi Tian.
// ACM MM2003 9p
class CV_EXPORTS FGDStatModel
{
public:
struct CV_EXPORTS Params
{
int Lc; // Quantized levels per 'color' component. Power of two, typically 32, 64 or 128.
int N1c; // Number of color vectors used to model normal background color variation at a given pixel.
int N2c; // Number of color vectors retained at given pixel. Must be > N1c, typically ~ 5/3 of N1c.
// Used to allow the first N1c vectors to adapt over time to changing background.
int Lcc; // Quantized levels per 'color co-occurrence' component. Power of two, typically 16, 32 or 64.
int N1cc; // Number of color co-occurrence vectors used to model normal background color variation at a given pixel.
int N2cc; // Number of color co-occurrence vectors retained at given pixel. Must be > N1cc, typically ~ 5/3 of N1cc.
// Used to allow the first N1cc vectors to adapt over time to changing background.
bool is_obj_without_holes; // If TRUE we ignore holes within foreground blobs. Defaults to TRUE.
int perform_morphing; // Number of erode-dilate-erode foreground-blob cleanup iterations.
// These erase one-pixel junk blobs and merge almost-touching blobs. Default value is 1.
float alpha1; // How quickly we forget old background pixel values seen. Typically set to 0.1.
float alpha2; // "Controls speed of feature learning". Depends on T. Typical value circa 0.005.
float alpha3; // Alternate to alpha2, used (e.g.) for quicker initial convergence. Typical value 0.1.
float delta; // Affects color and color co-occurrence quantization, typically set to 2.
float T; // A percentage value which determines when new features can be recognized as new background. (Typically 0.9).
float minArea; // Discard foreground blobs whose bounding box is smaller than this threshold.
// default Params
Params();
};
// out_cn - channels count in output result (can be 3 or 4)
// 4-channels require more memory, but a bit faster
explicit FGDStatModel(int out_cn = 3);
explicit FGDStatModel(const cv::gpu::GpuMat& firstFrame, const Params& params = Params(), int out_cn = 3);
~FGDStatModel();
void create(const cv::gpu::GpuMat& firstFrame, const Params& params = Params());
void release();
int update(const cv::gpu::GpuMat& curFrame);
//8UC3 or 8UC4 reference background image
cv::gpu::GpuMat background;
//8UC1 foreground image
cv::gpu::GpuMat foreground;
std::vector< std::vector<cv::Point> > foreground_regions;
private:
FGDStatModel(const FGDStatModel&);
FGDStatModel& operator=(const FGDStatModel&);
class Impl;
std::auto_ptr<Impl> impl_;
};
/*!
Gaussian Mixture-based Background/Foreground Segmentation Algorithm
The class implements the following algorithm:
"An improved adaptive background mixture model for real-time tracking with shadow detection"
P. KadewTraKuPong and R. Bowden,
Proc. 2nd European Workshop on Advanced Video-Based Surveillance Systems, 2001.
http://personal.ee.surrey.ac.uk/Personal/R.Bowden/publications/avbs01/avbs01.pdf
*/
class CV_EXPORTS MOG_GPU
{
public:
//! the default constructor
MOG_GPU(int nmixtures = -1);
//! re-initialization method
void initialize(Size frameSize, int frameType);
//! the update operator
void operator()(const GpuMat& frame, GpuMat& fgmask, float learningRate = 0.0f, Stream& stream = Stream::Null());
//! computes a background image which is the mean of all background Gaussians
void getBackgroundImage(GpuMat& backgroundImage, Stream& stream = Stream::Null()) const;
//! releases all inner buffers
void release();
int history;
float varThreshold;
float backgroundRatio;
float noiseSigma;
private:
int nmixtures_;
Size frameSize_;
int frameType_;
int nframes_;
GpuMat weight_;
GpuMat sortKey_;
GpuMat mean_;
GpuMat var_;
};
/*!
The class implements the following algorithm:
"Improved adaptive Gausian mixture model for background subtraction"
Z.Zivkovic
International Conference Pattern Recognition, UK, August, 2004.
http://www.zoranz.net/Publications/zivkovic2004ICPR.pdf
*/
class CV_EXPORTS MOG2_GPU
{
public:
//! the default constructor
MOG2_GPU(int nmixtures = -1);
//! re-initialization method
void initialize(Size frameSize, int frameType);
//! the update operator
void operator()(const GpuMat& frame, GpuMat& fgmask, float learningRate = -1.0f, Stream& stream = Stream::Null());
//! computes a background image which is the mean of all background Gaussians
void getBackgroundImage(GpuMat& backgroundImage, Stream& stream = Stream::Null()) const;
//! releases all inner buffers
void release();
// parameters
// you should call initialize after changing the parameters
int history;
//! here it is the maximum allowed number of mixture components.
//! Actual number is determined dynamically per pixel
float varThreshold;
// threshold on the squared Mahalanobis distance to decide if it is well described
// by the background model or not. Related to Cthr from the paper.
// This does not influence the update of the background. A typical value could be 4 sigma
// and that is varThreshold=4*4=16; Corresponds to Tb in the paper.
/////////////////////////
// less important parameters - things you might change but be careful
////////////////////////
float backgroundRatio;
// corresponds to fTB=1-cf from the paper
// TB - threshold when the component becomes significant enough to be included into
// the background model. It is the TB=1-cf from the paper. So I use cf=0.1 => TB=0.9.
// For alpha=0.001 it means that the mode should exist for approximately 105 frames before
// it is considered foreground
// float noiseSigma;
float varThresholdGen;
//corresponds to Tg - threshold on the squared Mahalan. dist. to decide
//when a sample is close to the existing components. If it is not close
//to any a new component will be generated. I use 3 sigma => Tg=3*3=9.
//Smaller Tg leads to more generated components and higher Tg might lead
//to a small number of components but they can grow too large
float fVarInit;
float fVarMin;
float fVarMax;
//initial variance for the newly generated components.
//It will influence the speed of adaptation. A good guess should be made.
//A simple way is to estimate the typical standard deviation from the images.
//I used here 10 as a reasonable value
// min and max can be used to further control the variance
float fCT; //CT - complexity reduction prior
//this is related to the number of samples needed to accept that a component
//actually exists. We use CT=0.05 of all the samples. By setting CT=0 you get
//the standard Stauffer&Grimson algorithm (maybe not exact but very similar)
//shadow detection parameters
bool bShadowDetection; //default 1 - do shadow detection
unsigned char nShadowDetection; //do shadow detection - insert this value as the detection result - 127 default value
float fTau;
// Tau - shadow threshold. The shadow is detected if the pixel is a darker
//version of the background. Tau is a threshold on how much darker the shadow can be.
//Tau= 0.5 means that if pixel is more than 2 times darker then it is not shadow
//See: Prati,Mikic,Trivedi,Cucchiarra,"Detecting Moving Shadows...",IEEE PAMI,2003.
private:
int nmixtures_;
Size frameSize_;
int frameType_;
int nframes_;
GpuMat weight_;
GpuMat variance_;
GpuMat mean_;
GpuMat bgmodelUsedModes_; //keep track of number of modes per pixel
};
/**
* Background Subtractor module. Takes a series of images and returns a sequence of mask (8UC1)
* images of the same size, where 255 indicates Foreground and 0 represents Background.
* This class implements an algorithm described in "Visual Tracking of Human Visitors under
* Variable-Lighting Conditions for a Responsive Audio Art Installation," A. Godbehere,
* A. Matsukawa, K. Goldberg, American Control Conference, Montreal, June 2012.
*/
class CV_EXPORTS GMG_GPU
{
public:
GMG_GPU();
/**
* Validate parameters and set up data structures for appropriate frame size.
* @param frameSize Input frame size
* @param min Minimum value taken on by pixels in image sequence. Usually 0
* @param max Maximum value taken on by pixels in image sequence. e.g. 1.0 or 255
*/
void initialize(Size frameSize, float min = 0.0f, float max = 255.0f);
/**
* Performs single-frame background subtraction and builds up a statistical background image
* model.
* @param frame Input frame
* @param fgmask Output mask image representing foreground and background pixels
* @param stream Stream for the asynchronous version
*/
void operator ()(const GpuMat& frame, GpuMat& fgmask, float learningRate = -1.0f, Stream& stream = Stream::Null());
//! Releases all inner buffers
void release();
//! Total number of distinct colors to maintain in histogram.
int maxFeatures;
//! Set between 0.0 and 1.0, determines how quickly features are "forgotten" from histograms.
float learningRate;
//! Number of frames of video to use to initialize histograms.
int numInitializationFrames;
//! Number of discrete levels in each channel to be used in histograms.
int quantizationLevels;
//! Prior probability that any given pixel is a background pixel. A sensitivity parameter.
float backgroundPrior;
//! Value above which pixel is determined to be FG.
float decisionThreshold;
//! Smoothing radius, in pixels, for cleaning up FG image.
int smoothingRadius;
//! Perform background model update.
bool updateBackgroundModel;
private:
float maxVal_, minVal_;
Size frameSize_;
int frameNum_;
GpuMat nfeatures_;
GpuMat colors_;
GpuMat weights_;
Ptr<FilterEngine_GPU> boxFilter_;
GpuMat buf_;
};
//! removes points (CV_32FC2, single row matrix) with zero mask value
CV_EXPORTS void compactPoints(GpuMat &points0, GpuMat &points1, const GpuMat &mask);

File diff suppressed because it is too large

View File

@@ -1,168 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
cv::gpu::GMG_GPU::GMG_GPU() { throw_no_cuda(); }
void cv::gpu::GMG_GPU::initialize(cv::Size, float, float) { throw_no_cuda(); }
void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, float, cv::gpu::Stream&) { throw_no_cuda(); }
void cv::gpu::GMG_GPU::release() {}
#else
namespace cv { namespace gpu { namespace cudev {
namespace bgfg_gmg
{
void loadConstants(int width, int height, float minVal, float maxVal, int quantizationLevels, float backgroundPrior,
float decisionThreshold, int maxFeatures, int numInitializationFrames);
template <typename SrcT>
void update_gpu(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
}
}}}
cv::gpu::GMG_GPU::GMG_GPU()
{
maxFeatures = 64;
learningRate = 0.025f;
numInitializationFrames = 120;
quantizationLevels = 16;
backgroundPrior = 0.8f;
decisionThreshold = 0.8f;
smoothingRadius = 7;
updateBackgroundModel = true;
}
void cv::gpu::GMG_GPU::initialize(cv::Size frameSize, float min, float max)
{
using namespace cv::gpu::cudev::bgfg_gmg;
CV_Assert(min < max);
CV_Assert(maxFeatures > 0);
CV_Assert(learningRate >= 0.0f && learningRate <= 1.0f);
CV_Assert(numInitializationFrames >= 1);
CV_Assert(quantizationLevels >= 1 && quantizationLevels <= 255);
CV_Assert(backgroundPrior >= 0.0f && backgroundPrior <= 1.0f);
minVal_ = min;
maxVal_ = max;
frameSize_ = frameSize;
frameNum_ = 0;
nfeatures_.create(frameSize_, CV_32SC1);
colors_.create(maxFeatures * frameSize_.height, frameSize_.width, CV_32SC1);
weights_.create(maxFeatures * frameSize_.height, frameSize_.width, CV_32FC1);
nfeatures_.setTo(cv::Scalar::all(0));
if (smoothingRadius > 0)
boxFilter_ = cv::gpu::createBoxFilter_GPU(CV_8UC1, CV_8UC1, cv::Size(smoothingRadius, smoothingRadius));
loadConstants(frameSize_.width, frameSize_.height, minVal_, maxVal_, quantizationLevels, backgroundPrior, decisionThreshold, maxFeatures, numInitializationFrames);
}
void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat& fgmask, float newLearningRate, cv::gpu::Stream& stream)
{
using namespace cv::gpu::cudev::bgfg_gmg;
typedef void (*func_t)(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{update_gpu<uchar>, 0, update_gpu<uchar3>, update_gpu<uchar4>},
{0,0,0,0},
{update_gpu<ushort>, 0, update_gpu<ushort3>, update_gpu<ushort4>},
{0,0,0,0},
{0,0,0,0},
{update_gpu<float>, 0, update_gpu<float3>, update_gpu<float4>}
};
CV_Assert(frame.depth() == CV_8U || frame.depth() == CV_16U || frame.depth() == CV_32F);
CV_Assert(frame.channels() == 1 || frame.channels() == 3 || frame.channels() == 4);
if (newLearningRate != -1.0f)
{
CV_Assert(newLearningRate >= 0.0f && newLearningRate <= 1.0f);
learningRate = newLearningRate;
}
if (frame.size() != frameSize_)
initialize(frame.size(), 0.0f, frame.depth() == CV_8U ? 255.0f : frame.depth() == CV_16U ? std::numeric_limits<ushort>::max() : 1.0f);
fgmask.create(frameSize_, CV_8UC1);
if (stream)
stream.enqueueMemSet(fgmask, cv::Scalar::all(0));
else
fgmask.setTo(cv::Scalar::all(0));
funcs[frame.depth()][frame.channels() - 1](frame, fgmask, colors_, weights_, nfeatures_, frameNum_, learningRate, updateBackgroundModel, cv::gpu::StreamAccessor::getStream(stream));
// approximate median filtering of the binary mask via a box filter followed by a threshold
if (smoothingRadius > 0)
{
boxFilter_->apply(fgmask, buf_, cv::Rect(0,0,-1,-1), stream);
int minCount = (smoothingRadius * smoothingRadius + 1) / 2;
double thresh = 255.0 * minCount / (smoothingRadius * smoothingRadius);
cv::gpu::threshold(buf_, fgmask, thresh, 255.0, cv::THRESH_BINARY, stream);
}
// keep track of how many frames we have processed
++frameNum_;
}
void cv::gpu::GMG_GPU::release()
{
frameSize_ = Size();
nfeatures_.release();
colors_.release();
weights_.release();
boxFilter_.release();
buf_.release();
}
#endif

View File

@@ -1,279 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
cv::gpu::MOG_GPU::MOG_GPU(int) { throw_no_cuda(); }
void cv::gpu::MOG_GPU::initialize(cv::Size, int) { throw_no_cuda(); }
void cv::gpu::MOG_GPU::operator()(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, float, Stream&) { throw_no_cuda(); }
void cv::gpu::MOG_GPU::getBackgroundImage(GpuMat&, Stream&) const { throw_no_cuda(); }
void cv::gpu::MOG_GPU::release() {}
cv::gpu::MOG2_GPU::MOG2_GPU(int) { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::initialize(cv::Size, int) { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::operator()(const GpuMat&, GpuMat&, float, Stream&) { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::getBackgroundImage(GpuMat&, Stream&) const { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::release() {}
#else
namespace cv { namespace gpu { namespace cudev
{
namespace mog
{
void mog_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var,
int nmixtures, float varThreshold, float learningRate, float backgroundRatio, float noiseSigma,
cudaStream_t stream);
void getBackgroundImage_gpu(int cn, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream);
void loadConstants(int nmixtures, float Tb, float TB, float Tg, float varInit, float varMin, float varMax, float tau, unsigned char shadowVal);
void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);
void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream);
}
}}}
namespace mog
{
const int defaultNMixtures = 5;
const int defaultHistory = 200;
const float defaultBackgroundRatio = 0.7f;
const float defaultVarThreshold = 2.5f * 2.5f;
const float defaultNoiseSigma = 30.0f * 0.5f;
const float defaultInitialWeight = 0.05f;
}
cv::gpu::MOG_GPU::MOG_GPU(int nmixtures) :
frameSize_(0, 0), frameType_(0), nframes_(0)
{
nmixtures_ = std::min(nmixtures > 0 ? nmixtures : mog::defaultNMixtures, 8);
history = mog::defaultHistory;
varThreshold = mog::defaultVarThreshold;
backgroundRatio = mog::defaultBackgroundRatio;
noiseSigma = mog::defaultNoiseSigma;
}
void cv::gpu::MOG_GPU::initialize(cv::Size frameSize, int frameType)
{
CV_Assert(frameType == CV_8UC1 || frameType == CV_8UC3 || frameType == CV_8UC4);
frameSize_ = frameSize;
frameType_ = frameType;
int ch = CV_MAT_CN(frameType);
int work_ch = ch;
// for each gaussian mixture of each pixel bg model we store
// the mixture sort key (w/sum_of_variances), the mixture weight (w),
// the mean (nchannels values) and
// the diagonal covariance matrix (another nchannels values)
weight_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
sortKey_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
mean_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC(work_ch));
var_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC(work_ch));
weight_.setTo(cv::Scalar::all(0));
sortKey_.setTo(cv::Scalar::all(0));
mean_.setTo(cv::Scalar::all(0));
var_.setTo(cv::Scalar::all(0));
nframes_ = 0;
}
void cv::gpu::MOG_GPU::operator()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat& fgmask, float learningRate, Stream& stream)
{
using namespace cv::gpu::cudev::mog;
CV_Assert(frame.depth() == CV_8U);
int ch = frame.channels();
int work_ch = ch;
if (nframes_ == 0 || learningRate >= 1.0 || frame.size() != frameSize_ || work_ch != mean_.channels())
initialize(frame.size(), frame.type());
fgmask.create(frameSize_, CV_8UC1);
++nframes_;
learningRate = learningRate >= 0.0f && nframes_ > 1 ? learningRate : 1.0f / std::min(nframes_, history);
CV_Assert(learningRate >= 0.0f);
mog_gpu(frame, ch, fgmask, weight_, sortKey_, mean_, var_, nmixtures_,
varThreshold, learningRate, backgroundRatio, noiseSigma,
StreamAccessor::getStream(stream));
}
void cv::gpu::MOG_GPU::getBackgroundImage(GpuMat& backgroundImage, Stream& stream) const
{
using namespace cv::gpu::cudev::mog;
backgroundImage.create(frameSize_, frameType_);
getBackgroundImage_gpu(backgroundImage.channels(), weight_, mean_, backgroundImage, nmixtures_, backgroundRatio, StreamAccessor::getStream(stream));
}
void cv::gpu::MOG_GPU::release()
{
frameSize_ = Size(0, 0);
frameType_ = 0;
nframes_ = 0;
weight_.release();
sortKey_.release();
mean_.release();
var_.release();
}
/////////////////////////////////////////////////////////////////
// MOG2
namespace mog2
{
// default parameters of gaussian background detection algorithm
const int defaultHistory = 500; // Learning rate; alpha = 1/defaultHistory
const float defaultVarThreshold = 4.0f * 4.0f;
const int defaultNMixtures = 5; // maximal number of Gaussians in mixture
const float defaultBackgroundRatio = 0.9f; // threshold sum of weights for background test
const float defaultVarThresholdGen = 3.0f * 3.0f;
const float defaultVarInit = 15.0f; // initial variance for new components
const float defaultVarMax = 5.0f * defaultVarInit;
const float defaultVarMin = 4.0f;
// additional parameters
const float defaultfCT = 0.05f; // complexity reduction prior constant 0 - no reduction of number of components
const unsigned char defaultnShadowDetection = 127; // value to use in the segmentation mask for shadows, set 0 not to do shadow detection
const float defaultfTau = 0.5f; // Tau - shadow threshold, see the paper for explanation
}
cv::gpu::MOG2_GPU::MOG2_GPU(int nmixtures) :
frameSize_(0, 0), frameType_(0), nframes_(0)
{
nmixtures_ = nmixtures > 0 ? nmixtures : mog2::defaultNMixtures;
history = mog2::defaultHistory;
varThreshold = mog2::defaultVarThreshold;
bShadowDetection = true;
backgroundRatio = mog2::defaultBackgroundRatio;
fVarInit = mog2::defaultVarInit;
fVarMax = mog2::defaultVarMax;
fVarMin = mog2::defaultVarMin;
varThresholdGen = mog2::defaultVarThresholdGen;
fCT = mog2::defaultfCT;
nShadowDetection = mog2::defaultnShadowDetection;
fTau = mog2::defaultfTau;
}
void cv::gpu::MOG2_GPU::initialize(cv::Size frameSize, int frameType)
{
using namespace cv::gpu::cudev::mog;
CV_Assert(frameType == CV_8UC1 || frameType == CV_8UC3 || frameType == CV_8UC4);
frameSize_ = frameSize;
frameType_ = frameType;
nframes_ = 0;
int ch = CV_MAT_CN(frameType);
int work_ch = ch;
// for each gaussian mixture of each pixel bg model we store ...
// the mixture weight (w),
// the mean (nchannels values) and
// the covariance
weight_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
variance_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
mean_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC(work_ch));
//make the array for keeping track of the used modes per pixel - all zeros at start
bgmodelUsedModes_.create(frameSize_, CV_8UC1);
bgmodelUsedModes_.setTo(cv::Scalar::all(0));
loadConstants(nmixtures_, varThreshold, backgroundRatio, varThresholdGen, fVarInit, fVarMin, fVarMax, fTau, nShadowDetection);
}
void cv::gpu::MOG2_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, float learningRate, Stream& stream)
{
using namespace cv::gpu::cudev::mog;
int ch = frame.channels();
int work_ch = ch;
if (nframes_ == 0 || learningRate >= 1.0f || frame.size() != frameSize_ || work_ch != mean_.channels())
initialize(frame.size(), frame.type());
fgmask.create(frameSize_, CV_8UC1);
fgmask.setTo(cv::Scalar::all(0));
++nframes_;
learningRate = learningRate >= 0.0f && nframes_ > 1 ? learningRate : 1.0f / std::min(2 * nframes_, history);
CV_Assert(learningRate >= 0.0f);
mog2_gpu(frame, frame.channels(), fgmask, bgmodelUsedModes_, weight_, variance_, mean_, learningRate, -learningRate * fCT, bShadowDetection, StreamAccessor::getStream(stream));
}
void cv::gpu::MOG2_GPU::getBackgroundImage(GpuMat& backgroundImage, Stream& stream) const
{
using namespace cv::gpu::cudev::mog;
backgroundImage.create(frameSize_, frameType_);
getBackgroundImage2_gpu(backgroundImage.channels(), bgmodelUsedModes_, weight_, mean_, backgroundImage, StreamAccessor::getStream(stream));
}
void cv::gpu::MOG2_GPU::release()
{
frameSize_ = Size(0, 0);
frameType_ = 0;
nframes_ = 0;
weight_.release();
variance_.release();
mean_.release();
bgmodelUsedModes_.release();
}
#endif

View File

@@ -1,258 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/limits.hpp"
namespace cv { namespace gpu { namespace cudev {
namespace bgfg_gmg
{
__constant__ int c_width;
__constant__ int c_height;
__constant__ float c_minVal;
__constant__ float c_maxVal;
__constant__ int c_quantizationLevels;
__constant__ float c_backgroundPrior;
__constant__ float c_decisionThreshold;
__constant__ int c_maxFeatures;
__constant__ int c_numInitializationFrames;
void loadConstants(int width, int height, float minVal, float maxVal, int quantizationLevels, float backgroundPrior,
float decisionThreshold, int maxFeatures, int numInitializationFrames)
{
cudaSafeCall( cudaMemcpyToSymbol(c_width, &width, sizeof(width)) );
cudaSafeCall( cudaMemcpyToSymbol(c_height, &height, sizeof(height)) );
cudaSafeCall( cudaMemcpyToSymbol(c_minVal, &minVal, sizeof(minVal)) );
cudaSafeCall( cudaMemcpyToSymbol(c_maxVal, &maxVal, sizeof(maxVal)) );
cudaSafeCall( cudaMemcpyToSymbol(c_quantizationLevels, &quantizationLevels, sizeof(quantizationLevels)) );
cudaSafeCall( cudaMemcpyToSymbol(c_backgroundPrior, &backgroundPrior, sizeof(backgroundPrior)) );
cudaSafeCall( cudaMemcpyToSymbol(c_decisionThreshold, &decisionThreshold, sizeof(decisionThreshold)) );
cudaSafeCall( cudaMemcpyToSymbol(c_maxFeatures, &maxFeatures, sizeof(maxFeatures)) );
cudaSafeCall( cudaMemcpyToSymbol(c_numInitializationFrames, &numInitializationFrames, sizeof(numInitializationFrames)) );
}
__device__ float findFeature(const int color, const PtrStepi& colors, const PtrStepf& weights, const int x, const int y, const int nfeatures)
{
for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
{
if (color == colors(fy, x))
return weights(fy, x);
}
// not in histogram, so return 0.
return 0.0f;
}
__device__ void normalizeHistogram(PtrStepf weights, const int x, const int y, const int nfeatures)
{
float total = 0.0f;
for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
total += weights(fy, x);
if (total != 0.0f)
{
for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
weights(fy, x) /= total;
}
}
__device__ bool insertFeature(const int color, const float weight, PtrStepi colors, PtrStepf weights, const int x, const int y, int& nfeatures)
{
for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
{
if (color == colors(fy, x))
{
// feature in histogram
weights(fy, x) += weight;
return false;
}
}
if (nfeatures == c_maxFeatures)
{
// discard the weakest feature (smallest weight)
int idx = -1;
float minVal = numeric_limits<float>::max();
for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
{
const float w = weights(fy, x);
if (w < minVal)
{
minVal = w;
idx = fy;
}
}
colors(idx, x) = color;
weights(idx, x) = weight;
return false;
}
colors(nfeatures * c_height + y, x) = color;
weights(nfeatures * c_height + y, x) = weight;
++nfeatures;
return true;
}
namespace detail
{
template <int cn> struct Quantization
{
template <typename T>
__device__ static int apply(const T& val)
{
int res = 0;
res |= static_cast<int>((val.x - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal));
res |= static_cast<int>((val.y - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal)) << 8;
res |= static_cast<int>((val.z - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal)) << 16;
return res;
}
};
template <> struct Quantization<1>
{
template <typename T>
__device__ static int apply(T val)
{
return static_cast<int>((val - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal));
}
};
}
template <typename T> struct Quantization : detail::Quantization<VecTraits<T>::cn> {};
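// Worked example (illustrative): with c_minVal = 0, c_maxVal = 256 and
// c_quantizationLevels = 16, a uchar3 pixel (200, 40, 90) quantizes to the
// per-channel indices (12, 2, 5) and is packed as 12 | (2 << 8) | (5 << 16),
// so two colors that fall into the same bins compare equal with a single
// integer comparison in findFeature()/insertFeature().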
template <typename SrcT>
__global__ void update(const PtrStep<SrcT> frame, PtrStepb fgmask, PtrStepi colors_, PtrStepf weights_, PtrStepi nfeatures_,
const int frameNum, const float learningRate, const bool updateBackgroundModel)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= c_width || y >= c_height)
return;
const SrcT pix = frame(y, x);
const int newFeatureColor = Quantization<SrcT>::apply(pix);
int nfeatures = nfeatures_(y, x);
if (frameNum >= c_numInitializationFrames)
{
// typical operation
const float weight = findFeature(newFeatureColor, colors_, weights_, x, y, nfeatures);
// see Godbehere, Matsukawa, Goldberg (2012) for reasoning behind this implementation of Bayes rule
const float posterior = (weight * c_backgroundPrior) / (weight * c_backgroundPrior + (1.0f - weight) * (1.0f - c_backgroundPrior));
const bool isForeground = ((1.0f - posterior) > c_decisionThreshold);
fgmask(y, x) = (uchar)(-isForeground);
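// note: -isForeground relies on unsigned wraparound, so the cast yields
// 255 for foreground pixels and 0 for background ones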
// update histogram.
if (updateBackgroundModel)
{
for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
weights_(fy, x) *= 1.0f - learningRate;
bool inserted = insertFeature(newFeatureColor, learningRate, colors_, weights_, x, y, nfeatures);
if (inserted)
{
normalizeHistogram(weights_, x, y, nfeatures);
nfeatures_(y, x) = nfeatures;
}
}
}
else if (updateBackgroundModel)
{
// training-mode update
insertFeature(newFeatureColor, 1.0f, colors_, weights_, x, y, nfeatures);
if (frameNum == c_numInitializationFrames - 1)
normalizeHistogram(weights_, x, y, nfeatures);
}
}
template <typename SrcT>
void update_gpu(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(update<SrcT>, cudaFuncCachePreferL1) );
update<SrcT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, colors, weights, nfeatures, frameNum, learningRate, updateBackgroundModel);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void update_gpu<uchar >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<uchar3 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<uchar4 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<ushort >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<ushort3>(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<ushort4>(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<float >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<float3 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<float4 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
}
}}}
#endif /* CUDA_DISABLER */
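
The foreground test in the update kernel above reduces to a single Bayes-rule posterior
per pixel. A minimal host-side sketch of that decision, where backgroundPrior and
decisionThreshold stand in for the constants loaded by loadConstants() and the sample
values are purely illustrative: ::

    #include <iostream>

    // same posterior as in update<SrcT>(), see Godbehere, Matsukawa, Goldberg (2012)
    bool isForegroundGMG(float featureWeight, float backgroundPrior, float decisionThreshold)
    {
        const float posterior = (featureWeight * backgroundPrior) /
                                (featureWeight * backgroundPrior +
                                 (1.0f - featureWeight) * (1.0f - backgroundPrior));
        return (1.0f - posterior) > decisionThreshold;
    }

    int main()
    {
        // a color never seen before (weight 0) is foreground,
        // a dominant feature (weight near 1) is background
        std::cout << isForegroundGMG(0.0f, 0.8f, 0.8f) << " "   // prints 1
                  << isForegroundGMG(0.9f, 0.8f, 0.8f) << "\n"; // prints 0
        return 0;
    }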

View File

@@ -1,764 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/limits.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace mog
{
///////////////////////////////////////////////////////////////
// Utility
__device__ __forceinline__ float cvt(uchar val)
{
return val;
}
__device__ __forceinline__ float3 cvt(const uchar3& val)
{
return make_float3(val.x, val.y, val.z);
}
__device__ __forceinline__ float4 cvt(const uchar4& val)
{
return make_float4(val.x, val.y, val.z, val.w);
}
__device__ __forceinline__ float sqr(float val)
{
return val * val;
}
__device__ __forceinline__ float sqr(const float3& val)
{
return val.x * val.x + val.y * val.y + val.z * val.z;
}
__device__ __forceinline__ float sqr(const float4& val)
{
return val.x * val.x + val.y * val.y + val.z * val.z;
}
__device__ __forceinline__ float sum(float val)
{
return val;
}
__device__ __forceinline__ float sum(const float3& val)
{
return val.x + val.y + val.z;
}
__device__ __forceinline__ float sum(const float4& val)
{
return val.x + val.y + val.z;
}
__device__ __forceinline__ float clamp(float var, float learningRate, float diff, float minVar)
{
return ::fmaxf(var + learningRate * (diff * diff - var), minVar);
}
__device__ __forceinline__ float3 clamp(const float3& var, float learningRate, const float3& diff, float minVar)
{
return make_float3(::fmaxf(var.x + learningRate * (diff.x * diff.x - var.x), minVar),
::fmaxf(var.y + learningRate * (diff.y * diff.y - var.y), minVar),
::fmaxf(var.z + learningRate * (diff.z * diff.z - var.z), minVar));
}
__device__ __forceinline__ float4 clamp(const float4& var, float learningRate, const float4& diff, float minVar)
{
return make_float4(::fmaxf(var.x + learningRate * (diff.x * diff.x - var.x), minVar),
::fmaxf(var.y + learningRate * (diff.y * diff.y - var.y), minVar),
::fmaxf(var.z + learningRate * (diff.z * diff.z - var.z), minVar),
0.0f);
}
template <class Ptr2D>
__device__ __forceinline__ void swap(Ptr2D& ptr, int x, int y, int k, int rows)
{
typename Ptr2D::elem_type val = ptr(k * rows + y, x);
ptr(k * rows + y, x) = ptr((k + 1) * rows + y, x);
ptr((k + 1) * rows + y, x) = val;
}
///////////////////////////////////////////////////////////////
// MOG without learning
template <typename SrcT, typename WorkT>
__global__ void mog_withoutLearning(const PtrStepSz<SrcT> frame, PtrStepb fgmask,
const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, const PtrStep<WorkT> gmm_var,
const int nmixtures, const float varThreshold, const float backgroundRatio)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= frame.cols || y >= frame.rows)
return;
WorkT pix = cvt(frame(y, x));
int kHit = -1;
int kForeground = -1;
for (int k = 0; k < nmixtures; ++k)
{
if (gmm_weight(k * frame.rows + y, x) < numeric_limits<float>::epsilon())
break;
WorkT mu = gmm_mean(k * frame.rows + y, x);
WorkT var = gmm_var(k * frame.rows + y, x);
WorkT diff = pix - mu;
if (sqr(diff) < varThreshold * sum(var))
{
kHit = k;
break;
}
}
if (kHit >= 0)
{
float wsum = 0.0f;
for (int k = 0; k < nmixtures; ++k)
{
wsum += gmm_weight(k * frame.rows + y, x);
if (wsum > backgroundRatio)
{
kForeground = k + 1;
break;
}
}
}
fgmask(y, x) = (uchar) (-(kHit < 0 || kHit >= kForeground));
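// -(true) wraps to 0xFF, so the mask is 255 (foreground) when no mode matched
// or the matched mode is not among the dominant "background" modes, and 0 otherwise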
}
template <typename SrcT, typename WorkT>
void mog_withoutLearning_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb var,
int nmixtures, float varThreshold, float backgroundRatio, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(mog_withoutLearning<SrcT, WorkT>, cudaFuncCachePreferL1) );
mog_withoutLearning<SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask,
weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<WorkT>) var,
nmixtures, varThreshold, backgroundRatio);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////
// MOG with learning
template <typename SrcT, typename WorkT>
__global__ void mog_withLearning(const PtrStepSz<SrcT> frame, PtrStepb fgmask,
PtrStepf gmm_weight, PtrStepf gmm_sortKey, PtrStep<WorkT> gmm_mean, PtrStep<WorkT> gmm_var,
const int nmixtures, const float varThreshold, const float backgroundRatio, const float learningRate, const float minVar)
{
const float w0 = 0.05f;
const float sk0 = w0 / (30.0f * 0.5f * 2.0f);
const float var0 = 30.0f * 0.5f * 30.0f * 0.5f * 4.0f;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= frame.cols || y >= frame.rows)
return;
WorkT pix = cvt(frame(y, x));
float wsum = 0.0f;
int kHit = -1;
int kForeground = -1;
int k = 0;
for (; k < nmixtures; ++k)
{
float w = gmm_weight(k * frame.rows + y, x);
wsum += w;
if (w < numeric_limits<float>::epsilon())
break;
WorkT mu = gmm_mean(k * frame.rows + y, x);
WorkT var = gmm_var(k * frame.rows + y, x);
WorkT diff = pix - mu;
if (sqr(diff) < varThreshold * sum(var))
{
wsum -= w;
float dw = learningRate * (1.0f - w);
var = clamp(var, learningRate, diff, minVar);
float sortKey_prev = w / ::sqrtf(sum(var));
gmm_sortKey(k * frame.rows + y, x) = sortKey_prev;
float weight_prev = w + dw;
gmm_weight(k * frame.rows + y, x) = weight_prev;
WorkT mean_prev = mu + learningRate * diff;
gmm_mean(k * frame.rows + y, x) = mean_prev;
WorkT var_prev = var;
gmm_var(k * frame.rows + y, x) = var_prev;
int k1 = k - 1;
if (k1 >= 0)
{
float sortKey_next = gmm_sortKey(k1 * frame.rows + y, x);
float weight_next = gmm_weight(k1 * frame.rows + y, x);
WorkT mean_next = gmm_mean(k1 * frame.rows + y, x);
WorkT var_next = gmm_var(k1 * frame.rows + y, x);
for (; sortKey_next < sortKey_prev && k1 >= 0; --k1)
{
gmm_sortKey(k1 * frame.rows + y, x) = sortKey_prev;
gmm_sortKey((k1 + 1) * frame.rows + y, x) = sortKey_next;
gmm_weight(k1 * frame.rows + y, x) = weight_prev;
gmm_weight((k1 + 1) * frame.rows + y, x) = weight_next;
gmm_mean(k1 * frame.rows + y, x) = mean_prev;
gmm_mean((k1 + 1) * frame.rows + y, x) = mean_next;
gmm_var(k1 * frame.rows + y, x) = var_prev;
gmm_var((k1 + 1) * frame.rows + y, x) = var_next;
sortKey_prev = sortKey_next;
sortKey_next = k1 > 0 ? gmm_sortKey((k1 - 1) * frame.rows + y, x) : 0.0f;
weight_prev = weight_next;
weight_next = k1 > 0 ? gmm_weight((k1 - 1) * frame.rows + y, x) : 0.0f;
mean_prev = mean_next;
mean_next = k1 > 0 ? gmm_mean((k1 - 1) * frame.rows + y, x) : VecTraits<WorkT>::all(0.0f);
var_prev = var_next;
var_next = k1 > 0 ? gmm_var((k1 - 1) * frame.rows + y, x) : VecTraits<WorkT>::all(0.0f);
}
}
kHit = k1 + 1;
break;
}
}
if (kHit < 0)
{
// no appropriate gaussian mixture found at all, remove the weakest mixture and create a new one
kHit = k = ::min(k, nmixtures - 1);
wsum += w0 - gmm_weight(k * frame.rows + y, x);
gmm_weight(k * frame.rows + y, x) = w0;
gmm_mean(k * frame.rows + y, x) = pix;
gmm_var(k * frame.rows + y, x) = VecTraits<WorkT>::all(var0);
gmm_sortKey(k * frame.rows + y, x) = sk0;
}
else
{
for( ; k < nmixtures; k++)
wsum += gmm_weight(k * frame.rows + y, x);
}
float wscale = 1.0f / wsum;
wsum = 0;
for (k = 0; k < nmixtures; ++k)
{
float w = gmm_weight(k * frame.rows + y, x);
wsum += w *= wscale;
gmm_weight(k * frame.rows + y, x) = w;
gmm_sortKey(k * frame.rows + y, x) *= wscale;
if (wsum > backgroundRatio && kForeground < 0)
kForeground = k + 1;
}
fgmask(y, x) = (uchar)(-(kHit >= kForeground));
}
template <typename SrcT, typename WorkT>
void mog_withLearning_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var,
int nmixtures, float varThreshold, float backgroundRatio, float learningRate, float minVar,
cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(mog_withLearning<SrcT, WorkT>, cudaFuncCachePreferL1) );
mog_withLearning<SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask,
weight, sortKey, (PtrStepSz<WorkT>) mean, (PtrStepSz<WorkT>) var,
nmixtures, varThreshold, backgroundRatio, learningRate, minVar);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////
// MOG
void mog_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float learningRate, float backgroundRatio, float noiseSigma, cudaStream_t stream)
{
typedef void (*withoutLearning_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float backgroundRatio, cudaStream_t stream);
typedef void (*withLearning_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float backgroundRatio, float learningRate, float minVar, cudaStream_t stream);
static const withoutLearning_t withoutLearning[] =
{
0, mog_withoutLearning_caller<uchar, float>, 0, mog_withoutLearning_caller<uchar3, float3>, mog_withoutLearning_caller<uchar4, float4>
};
static const withLearning_t withLearning[] =
{
0, mog_withLearning_caller<uchar, float>, 0, mog_withLearning_caller<uchar3, float3>, mog_withLearning_caller<uchar4, float4>
};
const float minVar = noiseSigma * noiseSigma;
if (learningRate > 0.0f)
withLearning[cn](frame, fgmask, weight, sortKey, mean, var, nmixtures, varThreshold, backgroundRatio, learningRate, minVar, stream);
else
withoutLearning[cn](frame, fgmask, weight, mean, var, nmixtures, varThreshold, backgroundRatio, stream);
}
template <typename WorkT, typename OutT>
__global__ void getBackgroundImage(const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, PtrStepSz<OutT> dst, const int nmixtures, const float backgroundRatio)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= dst.cols || y >= dst.rows)
return;
WorkT meanVal = VecTraits<WorkT>::all(0.0f);
float totalWeight = 0.0f;
for (int mode = 0; mode < nmixtures; ++mode)
{
float weight = gmm_weight(mode * dst.rows + y, x);
WorkT mean = gmm_mean(mode * dst.rows + y, x);
meanVal = meanVal + weight * mean;
totalWeight += weight;
if(totalWeight > backgroundRatio)
break;
}
meanVal = meanVal * (1.f / totalWeight);
dst(y, x) = saturate_cast<OutT>(meanVal);
}
template <typename WorkT, typename OutT>
void getBackgroundImage_caller(PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(getBackgroundImage<WorkT, OutT>, cudaFuncCachePreferL1) );
getBackgroundImage<WorkT, OutT><<<grid, block, 0, stream>>>(weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<OutT>) dst, nmixtures, backgroundRatio);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void getBackgroundImage_gpu(int cn, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream);
static const func_t funcs[] =
{
0, getBackgroundImage_caller<float, uchar>, 0, getBackgroundImage_caller<float3, uchar3>, getBackgroundImage_caller<float4, uchar4>
};
funcs[cn](weight, mean, dst, nmixtures, backgroundRatio, stream);
}
///////////////////////////////////////////////////////////////
// MOG2
__constant__ int c_nmixtures;
__constant__ float c_Tb;
__constant__ float c_TB;
__constant__ float c_Tg;
__constant__ float c_varInit;
__constant__ float c_varMin;
__constant__ float c_varMax;
__constant__ float c_tau;
__constant__ unsigned char c_shadowVal;
void loadConstants(int nmixtures, float Tb, float TB, float Tg, float varInit, float varMin, float varMax, float tau, unsigned char shadowVal)
{
varMin = ::fminf(varMin, varMax);
varMax = ::fmaxf(varMin, varMax);
cudaSafeCall( cudaMemcpyToSymbol(c_nmixtures, &nmixtures, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_Tb, &Tb, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c_TB, &TB, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c_Tg, &Tg, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c_varInit, &varInit, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c_varMin, &varMin, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c_varMax, &varMax, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c_tau, &tau, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c_shadowVal, &shadowVal, sizeof(unsigned char)) );
}
template <bool detectShadows, typename SrcT, typename WorkT>
__global__ void mog2(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStepb modesUsed,
PtrStepf gmm_weight, PtrStepf gmm_variance, PtrStep<WorkT> gmm_mean,
const float alphaT, const float alpha1, const float prune)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= frame.cols || y >= frame.rows)
return;
WorkT pix = cvt(frame(y, x));
//calculate distances to the modes (+ sort)
//here we need to go in descending order!!!
bool background = false; // true - the pixel classified as background
//internal:
bool fitsPDF = false; //if it remains zero a new GMM mode will be added
int nmodes = modesUsed(y, x);
int nNewModes = nmodes; //current number of modes in GMM
float totalWeight = 0.0f;
//go through all modes
for (int mode = 0; mode < nmodes; ++mode)
{
//need only weight if fit is found
float weight = alpha1 * gmm_weight(mode * frame.rows + y, x) + prune;
//fit not found yet
if (!fitsPDF)
{
//check if it belongs to some of the remaining modes
float var = gmm_variance(mode * frame.rows + y, x);
WorkT mean = gmm_mean(mode * frame.rows + y, x);
//calculate difference and distance
WorkT diff = mean - pix;
float dist2 = sqr(diff);
//background? - Tb - usually larger than Tg
if (totalWeight < c_TB && dist2 < c_Tb * var)
background = true;
//check fit
if (dist2 < c_Tg * var)
{
//belongs to the mode
fitsPDF = true;
//update distribution
//update weight
weight += alphaT;
float k = alphaT / weight;
//update mean
gmm_mean(mode * frame.rows + y, x) = mean - k * diff;
//update variance
float varnew = var + k * (dist2 - var);
//limit the variance
varnew = ::fmaxf(varnew, c_varMin);
varnew = ::fminf(varnew, c_varMax);
gmm_variance(mode * frame.rows + y, x) = varnew;
//sort
//all other weights are at the same place and
//only the matched (iModes) is higher -> just find the new place for it
for (int i = mode; i > 0; --i)
{
//check one up
if (weight < gmm_weight((i - 1) * frame.rows + y, x))
break;
//swap one up
swap(gmm_weight, x, y, i - 1, frame.rows);
swap(gmm_variance, x, y, i - 1, frame.rows);
swap(gmm_mean, x, y, i - 1, frame.rows);
}
//belongs to the mode - fitsPDF has been set to true above
}
} // !fitsPDF
//check prune
if (weight < -prune)
{
weight = 0.0f;
nmodes--;
}
gmm_weight(mode * frame.rows + y, x) = weight; //update weight by the calculated value
totalWeight += weight;
}
//renormalize weights
totalWeight = 1.f / totalWeight;
for (int mode = 0; mode < nmodes; ++mode)
gmm_weight(mode * frame.rows + y, x) *= totalWeight;
nmodes = nNewModes;
//make new mode if needed and exit
if (!fitsPDF)
{
// replace the weakest or add a new one
int mode = nmodes == c_nmixtures ? c_nmixtures - 1 : nmodes++;
if (nmodes == 1)
gmm_weight(mode * frame.rows + y, x) = 1.f;
else
{
gmm_weight(mode * frame.rows + y, x) = alphaT;
// renormalize all other weights
for (int i = 0; i < nmodes - 1; ++i)
gmm_weight(i * frame.rows + y, x) *= alpha1;
}
// init
gmm_mean(mode * frame.rows + y, x) = pix;
gmm_variance(mode * frame.rows + y, x) = c_varInit;
//sort
//find the new place for it
for (int i = nmodes - 1; i > 0; --i)
{
// check one up
if (alphaT < gmm_weight((i - 1) * frame.rows + y, x))
break;
//swap one up
swap(gmm_weight, x, y, i - 1, frame.rows);
swap(gmm_variance, x, y, i - 1, frame.rows);
swap(gmm_mean, x, y, i - 1, frame.rows);
}
}
//set the number of modes
modesUsed(y, x) = nmodes;
bool isShadow = false;
if (detectShadows && !background)
{
float tWeight = 0.0f;
// check all the components marked as background:
for (int mode = 0; mode < nmodes; ++mode)
{
WorkT mean = gmm_mean(mode * frame.rows + y, x);
WorkT pix_mean = pix * mean;
float numerator = sum(pix_mean);
float denominator = sqr(mean);
// no division by zero allowed
if (denominator == 0)
break;
// if tau < a < 1 then also check the color distortion
if (numerator <= denominator && numerator >= c_tau * denominator)
{
float a = numerator / denominator;
WorkT dD = a * mean - pix;
if (sqr(dD) < c_Tb * gmm_variance(mode * frame.rows + y, x) * a * a)
{
isShadow = true;
break;
}
};
tWeight += gmm_weight(mode * frame.rows + y, x);
if (tWeight > c_TB)
break;
}
}
fgmask(y, x) = background ? 0 : isShadow ? c_shadowVal : 255;
}
template <typename SrcT, typename WorkT>
void mog2_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean,
float alphaT, float prune, bool detectShadows, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
const float alpha1 = 1.0f - alphaT;
if (detectShadows)
{
cudaSafeCall( cudaFuncSetCacheConfig(mog2<true, SrcT, WorkT>, cudaFuncCachePreferL1) );
mog2<true, SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, modesUsed,
weight, variance, (PtrStepSz<WorkT>) mean,
alphaT, alpha1, prune);
}
else
{
cudaSafeCall( cudaFuncSetCacheConfig(mog2<false, SrcT, WorkT>, cudaFuncCachePreferL1) );
mog2<false, SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, modesUsed,
weight, variance, (PtrStepSz<WorkT>) mean,
alphaT, alpha1, prune);
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean,
float alphaT, float prune, bool detectShadows, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);
static const func_t funcs[] =
{
0, mog2_caller<uchar, float>, 0, mog2_caller<uchar3, float3>, mog2_caller<uchar4, float4>
};
funcs[cn](frame, fgmask, modesUsed, weight, variance, mean, alphaT, prune, detectShadows, stream);
}
template <typename WorkT, typename OutT>
__global__ void getBackgroundImage2(const PtrStepSzb modesUsed, const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, PtrStep<OutT> dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= modesUsed.cols || y >= modesUsed.rows)
return;
int nmodes = modesUsed(y, x);
WorkT meanVal = VecTraits<WorkT>::all(0.0f);
float totalWeight = 0.0f;
for (int mode = 0; mode < nmodes; ++mode)
{
float weight = gmm_weight(mode * modesUsed.rows + y, x);
WorkT mean = gmm_mean(mode * modesUsed.rows + y, x);
meanVal = meanVal + weight * mean;
totalWeight += weight;
if(totalWeight > c_TB)
break;
}
meanVal = meanVal * (1.f / totalWeight);
dst(y, x) = saturate_cast<OutT>(meanVal);
}
template <typename WorkT, typename OutT>
void getBackgroundImage2_caller(PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(modesUsed.cols, block.x), divUp(modesUsed.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(getBackgroundImage2<WorkT, OutT>, cudaFuncCachePreferL1) );
getBackgroundImage2<WorkT, OutT><<<grid, block, 0, stream>>>(modesUsed, weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<OutT>) dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0, getBackgroundImage2_caller<float, uchar>, 0, getBackgroundImage2_caller<float3, uchar3>, getBackgroundImage2_caller<float4, uchar4>
};
funcs[cn](modesUsed, weight, mean, dst, stream);
}
}
}}}
#endif /* CUDA_DISABLER */
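
For reference, the parameters reaching the mog2 kernel relate to the host-side values in
MOG2_GPU::operator() as restated in this small standalone sketch (the numeric values are
illustrative only): ::

    #include <algorithm>
    #include <iostream>

    int main()
    {
        const int   history      = 500;     // MOG2_GPU::history
        const float fCT          = 0.05f;   // complexity reduction prior
        const int   nframes      = 42;      // frames processed so far
        const float learningRate = -1.0f;   // negative -> automatic rate

        // automatic learning rate, as in MOG2_GPU::operator()
        const float lr = (learningRate >= 0.0f && nframes > 1)
                             ? learningRate
                             : 1.0f / std::min(2 * nframes, history);

        const float alphaT = lr;            // passed to mog2_gpu() as alphaT
        const float prune  = -lr * fCT;     // passed to mog2_gpu() as prune
        const float alpha1 = 1.0f - alphaT; // computed inside mog2_caller()

        std::cout << alphaT << " " << alpha1 << " " << prune << "\n";
        return 0;
    }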

View File

@@ -1,801 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "fgd_bgfg_common.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace bgfg
{
////////////////////////////////////////////////////////////////////////////
// calcDiffHistogram
const unsigned int UINT_BITS = 32U;
const int LOG_WARP_SIZE = 5;
const int WARP_SIZE = 1 << LOG_WARP_SIZE;
#if (__CUDA_ARCH__ < 120)
const unsigned int TAG_MASK = (1U << (UINT_BITS - LOG_WARP_SIZE)) - 1U;
#endif
const int MERGE_THREADBLOCK_SIZE = 256;
__device__ __forceinline__ void addByte(unsigned int* s_WarpHist_, unsigned int data, unsigned int threadTag)
{
#if (__CUDA_ARCH__ < 120)
volatile unsigned int* s_WarpHist = s_WarpHist_;
unsigned int count;
do
{
count = s_WarpHist[data] & TAG_MASK;
count = threadTag | (count + 1);
s_WarpHist[data] = count;
} while (s_WarpHist[data] != count);
#else
atomicInc(s_WarpHist_ + data, (unsigned int)(-1));
#endif
}
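// Note (illustrative): on devices older than compute capability 1.2 shared-memory
// atomics are unavailable, so the fallback above tags each write with the warp-lane
// id in the top LOG_WARP_SIZE bits and retries until its own tagged count sticks;
// newer devices simply use atomicInc on the per-warp histogram bin.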
template <typename PT, typename CT>
__global__ void calcPartialHistogram(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2)
{
#if (__CUDA_ARCH__ < 200)
const int HISTOGRAM_WARP_COUNT = 4;
#else
const int HISTOGRAM_WARP_COUNT = 6;
#endif
const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
const int HISTOGRAM_THREADBLOCK_MEMORY = HISTOGRAM_WARP_COUNT * HISTOGRAM_BIN_COUNT;
//Per-warp subhistogram storage
__shared__ unsigned int s_Hist0[HISTOGRAM_THREADBLOCK_MEMORY];
__shared__ unsigned int s_Hist1[HISTOGRAM_THREADBLOCK_MEMORY];
__shared__ unsigned int s_Hist2[HISTOGRAM_THREADBLOCK_MEMORY];
//Clear shared memory storage for current threadblock before processing
#pragma unroll
for (int i = 0; i < (HISTOGRAM_THREADBLOCK_MEMORY / HISTOGRAM_THREADBLOCK_SIZE); ++i)
{
s_Hist0[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
s_Hist1[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
s_Hist2[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
}
__syncthreads();
const unsigned int warpId = threadIdx.x >> LOG_WARP_SIZE;
unsigned int* s_WarpHist0 = s_Hist0 + warpId * HISTOGRAM_BIN_COUNT;
unsigned int* s_WarpHist1 = s_Hist1 + warpId * HISTOGRAM_BIN_COUNT;
unsigned int* s_WarpHist2 = s_Hist2 + warpId * HISTOGRAM_BIN_COUNT;
const unsigned int tag = threadIdx.x << (UINT_BITS - LOG_WARP_SIZE);
const int dataCount = prevFrame.rows * prevFrame.cols;
for (unsigned int pos = blockIdx.x * HISTOGRAM_THREADBLOCK_SIZE + threadIdx.x; pos < dataCount; pos += HISTOGRAM_THREADBLOCK_SIZE * PARTIAL_HISTOGRAM_COUNT)
{
const unsigned int y = pos / prevFrame.cols;
const unsigned int x = pos % prevFrame.cols;
PT prevVal = prevFrame(y, x);
CT curVal = curFrame(y, x);
int3 diff = make_int3(
::abs(curVal.x - prevVal.x),
::abs(curVal.y - prevVal.y),
::abs(curVal.z - prevVal.z)
);
addByte(s_WarpHist0, diff.x, tag);
addByte(s_WarpHist1, diff.y, tag);
addByte(s_WarpHist2, diff.z, tag);
}
__syncthreads();
//Merge per-warp histograms into per-block and write to global memory
for (unsigned int bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += HISTOGRAM_THREADBLOCK_SIZE)
{
unsigned int sum0 = 0;
unsigned int sum1 = 0;
unsigned int sum2 = 0;
#pragma unroll
for (int i = 0; i < HISTOGRAM_WARP_COUNT; ++i)
{
#if (__CUDA_ARCH__ < 120)
sum0 += s_Hist0[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
sum1 += s_Hist1[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
sum2 += s_Hist2[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
#else
sum0 += s_Hist0[bin + i * HISTOGRAM_BIN_COUNT];
sum1 += s_Hist1[bin + i * HISTOGRAM_BIN_COUNT];
sum2 += s_Hist2[bin + i * HISTOGRAM_BIN_COUNT];
#endif
}
partialBuf0[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum0;
partialBuf1[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum1;
partialBuf2[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum2;
}
}
__global__ void mergeHistogram(const unsigned int* partialBuf0, const unsigned int* partialBuf1, const unsigned int* partialBuf2, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2)
{
unsigned int sum0 = 0;
unsigned int sum1 = 0;
unsigned int sum2 = 0;
#pragma unroll
for (unsigned int i = threadIdx.x; i < PARTIAL_HISTOGRAM_COUNT; i += MERGE_THREADBLOCK_SIZE)
{
sum0 += partialBuf0[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
sum1 += partialBuf1[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
sum2 += partialBuf2[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
}
__shared__ unsigned int data0[MERGE_THREADBLOCK_SIZE];
__shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
__shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
plus<unsigned int> op;
reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));
if(threadIdx.x == 0)
{
hist0[blockIdx.x] = sum0;
hist1[blockIdx.x] = sum1;
hist2[blockIdx.x] = sum2;
}
}
template <typename PT, typename CT>
void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame,
unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
bool cc20, cudaStream_t stream)
{
const int HISTOGRAM_WARP_COUNT = cc20 ? 6 : 4;
const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, HISTOGRAM_THREADBLOCK_SIZE, 0, stream>>>(
(PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame, partialBuf0, partialBuf1, partialBuf2);
cudaSafeCall( cudaGetLastError() );
mergeHistogram<<<HISTOGRAM_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(partialBuf0, partialBuf1, partialBuf2, hist0, hist1, hist2);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
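// Illustrative note: the difference histogram is built in two passes.
// calcPartialHistogram writes PARTIAL_HISTOGRAM_COUNT partial histograms of
// HISTOGRAM_BIN_COUNT bins each (one per block, accumulated per warp in shared
// memory), and mergeHistogram then reduces bin b across all partial buffers
// into hist0/1/2[b].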
template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
/////////////////////////////////////////////////////////////////////////
// calcDiffThreshMask
template <typename PT, typename CT>
__global__ void calcDiffThreshMask(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, uchar3 bestThres, PtrStepb changeMask)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
if (y >= prevFrame.rows || x >= prevFrame.cols)
return;
PT prevVal = prevFrame(y, x);
CT curVal = curFrame(y, x);
int3 diff = make_int3(
::abs(curVal.x - prevVal.x),
::abs(curVal.y - prevVal.y),
::abs(curVal.z - prevVal.z)
);
if (diff.x > bestThres.x || diff.y > bestThres.y || diff.z > bestThres.z)
changeMask(y, x) = 255;
}
template <typename PT, typename CT>
void calcDiffThreshMask_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
calcDiffThreshMask<PT, CT><<<grid, block, 0, stream>>>((PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame, bestThres, changeMask);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void calcDiffThreshMask_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
template void calcDiffThreshMask_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
template void calcDiffThreshMask_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
template void calcDiffThreshMask_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
/////////////////////////////////////////////////////////////////////////
// bgfgClassification
__constant__ BGPixelStat c_stat;
void setBGPixelStat(const BGPixelStat& stat)
{
cudaSafeCall( cudaMemcpyToSymbol(c_stat, &stat, sizeof(BGPixelStat)) );
}
template <typename T> struct Output;
template <> struct Output<uchar3>
{
static __device__ __forceinline__ uchar3 make(uchar v0, uchar v1, uchar v2)
{
return make_uchar3(v0, v1, v2);
}
};
template <> struct Output<uchar4>
{
static __device__ __forceinline__ uchar4 make(uchar v0, uchar v1, uchar v2)
{
return make_uchar4(v0, v1, v2, 255);
}
};
template <typename PT, typename CT, typename OT>
__global__ void bgfgClassification(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame,
const PtrStepb Ftd, const PtrStepb Fbd, PtrStepb foreground,
int deltaC, int deltaCC, float alpha2, int N1c, int N1cc)
{
const int i = blockIdx.y * blockDim.y + threadIdx.y;
const int j = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= prevFrame.rows || j >= prevFrame.cols)
return;
if (Fbd(i, j) || Ftd(i, j))
{
float Pb = 0.0f;
float Pv = 0.0f;
float Pvb = 0.0f;
int val = 0;
// Is it a motion pixel?
if (Ftd(i, j))
{
if (!c_stat.is_trained_dyn_model(i, j))
val = 1;
else
{
PT prevVal = prevFrame(i, j);
CT curVal = curFrame(i, j);
// Compare with stored CCt vectors:
for (int k = 0; k < N1cc && c_stat.PV_CC(i, j, k) > alpha2; ++k)
{
OT v1 = c_stat.V1_CC<OT>(i, j, k);
OT v2 = c_stat.V2_CC<OT>(i, j, k);
if (::abs(v1.x - prevVal.x) <= deltaCC &&
::abs(v1.y - prevVal.y) <= deltaCC &&
::abs(v1.z - prevVal.z) <= deltaCC &&
::abs(v2.x - curVal.x) <= deltaCC &&
::abs(v2.y - curVal.y) <= deltaCC &&
::abs(v2.z - curVal.z) <= deltaCC)
{
Pv += c_stat.PV_CC(i, j, k);
Pvb += c_stat.PVB_CC(i, j, k);
}
}
Pb = c_stat.Pbcc(i, j);
if (2 * Pvb * Pb <= Pv)
val = 1;
}
}
else if(c_stat.is_trained_st_model(i, j))
{
CT curVal = curFrame(i, j);
// Compare with stored Ct vectors:
for (int k = 0; k < N1c && c_stat.PV_C(i, j, k) > alpha2; ++k)
{
OT v = c_stat.V_C<OT>(i, j, k);
if (::abs(v.x - curVal.x) <= deltaC &&
::abs(v.y - curVal.y) <= deltaC &&
::abs(v.z - curVal.z) <= deltaC)
{
Pv += c_stat.PV_C(i, j, k);
Pvb += c_stat.PVB_C(i, j, k);
}
}
Pb = c_stat.Pbc(i, j);
if (2 * Pvb * Pb <= Pv)
val = 1;
}
// Update foreground:
foreground(i, j) = static_cast<uchar>(val);
} // end if( change detection...
}
template <typename PT, typename CT, typename OT>
void bgfgClassification_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground,
int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(bgfgClassification<PT, CT, OT>, cudaFuncCachePreferL1) );
bgfgClassification<PT, CT, OT><<<grid, block, 0, stream>>>((PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame,
Ftd, Fbd, foreground,
deltaC, deltaCC, alpha2, N1c, N1cc);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void bgfgClassification_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template void bgfgClassification_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template void bgfgClassification_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template void bgfgClassification_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template void bgfgClassification_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template void bgfgClassification_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template void bgfgClassification_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template void bgfgClassification_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
////////////////////////////////////////////////////////////////////////////
// updateBackgroundModel
template <typename PT, typename CT, typename OT, class PrevFramePtr2D, class CurFramePtr2D, class FtdPtr2D, class FbdPtr2D>
__global__ void updateBackgroundModel(int cols, int rows, const PrevFramePtr2D prevFrame, const CurFramePtr2D curFrame, const FtdPtr2D Ftd, const FbdPtr2D Fbd,
PtrStepb foreground, PtrStep<OT> background,
int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T)
{
const int i = blockIdx.y * blockDim.y + threadIdx.y;
const int j = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= rows || j >= cols)
return;
const float MIN_PV = 1e-10f;
const uchar is_trained_dyn_model = c_stat.is_trained_dyn_model(i, j);
if (Ftd(i, j) || !is_trained_dyn_model)
{
const float alpha = is_trained_dyn_model ? alpha2 : alpha3;
float Pbcc = c_stat.Pbcc(i, j);
//update Pb
Pbcc *= (1.0f - alpha);
if (!foreground(i, j))
{
Pbcc += alpha;
}
int min_dist = numeric_limits<int>::max();
int indx = -1;
PT prevVal = prevFrame(i, j);
CT curVal = curFrame(i, j);
// Find best Vi match:
for (int k = 0; k < N2cc; ++k)
{
float PV_CC = c_stat.PV_CC(i, j, k);
if (!PV_CC)
break;
if (PV_CC < MIN_PV)
{
c_stat.PV_CC(i, j, k) = 0;
c_stat.PVB_CC(i, j, k) = 0;
continue;
}
c_stat.PV_CC(i, j, k) = PV_CC * (1.0f - alpha);
c_stat.PVB_CC(i, j, k) = c_stat.PVB_CC(i, j, k) * (1.0f - alpha);
OT v1 = c_stat.V1_CC<OT>(i, j, k);
int3 val1 = make_int3(
::abs(v1.x - prevVal.x),
::abs(v1.y - prevVal.y),
::abs(v1.z - prevVal.z)
);
OT v2 = c_stat.V2_CC<OT>(i, j, k);
int3 val2 = make_int3(
::abs(v2.x - curVal.x),
::abs(v2.y - curVal.y),
::abs(v2.z - curVal.z)
);
int dist = val1.x + val1.y + val1.z + val2.x + val2.y + val2.z;
if (dist < min_dist &&
val1.x <= deltaCC && val1.y <= deltaCC && val1.z <= deltaCC &&
val2.x <= deltaCC && val2.y <= deltaCC && val2.z <= deltaCC)
{
min_dist = dist;
indx = k;
}
}
if (indx < 0)
{
// Replace the N2-th element in the table with the new feature:
indx = N2cc - 1;
c_stat.PV_CC(i, j, indx) = alpha;
c_stat.PVB_CC(i, j, indx) = alpha;
//update Vt
c_stat.V1_CC<OT>(i, j, indx) = Output<OT>::make(prevVal.x, prevVal.y, prevVal.z);
c_stat.V2_CC<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
}
else
{
// Update:
c_stat.PV_CC(i, j, indx) += alpha;
if (!foreground(i, j))
{
c_stat.PVB_CC(i, j, indx) += alpha;
}
}
//re-sort CCt table by Pv
const float PV_CC_indx = c_stat.PV_CC(i, j, indx);
const float PVB_CC_indx = c_stat.PVB_CC(i, j, indx);
const OT V1_CC_indx = c_stat.V1_CC<OT>(i, j, indx);
const OT V2_CC_indx = c_stat.V2_CC<OT>(i, j, indx);
for (int k = 0; k < indx; ++k)
{
if (c_stat.PV_CC(i, j, k) <= PV_CC_indx)
{
//shift elements
float Pv_tmp1;
float Pv_tmp2 = PV_CC_indx;
float Pvb_tmp1;
float Pvb_tmp2 = PVB_CC_indx;
OT v1_tmp1;
OT v1_tmp2 = V1_CC_indx;
OT v2_tmp1;
OT v2_tmp2 = V2_CC_indx;
for (int l = k; l <= indx; ++l)
{
Pv_tmp1 = c_stat.PV_CC(i, j, l);
c_stat.PV_CC(i, j, l) = Pv_tmp2;
Pv_tmp2 = Pv_tmp1;
Pvb_tmp1 = c_stat.PVB_CC(i, j, l);
c_stat.PVB_CC(i, j, l) = Pvb_tmp2;
Pvb_tmp2 = Pvb_tmp1;
v1_tmp1 = c_stat.V1_CC<OT>(i, j, l);
c_stat.V1_CC<OT>(i, j, l) = v1_tmp2;
v1_tmp2 = v1_tmp1;
v2_tmp1 = c_stat.V2_CC<OT>(i, j, l);
c_stat.V2_CC<OT>(i, j, l) = v2_tmp2;
v2_tmp2 = v2_tmp1;
}
break;
}
}
float sum1 = 0.0f;
float sum2 = 0.0f;
//check "once-off" changes
for (int k = 0; k < N1cc; ++k)
{
const float PV_CC = c_stat.PV_CC(i, j, k);
if (!PV_CC)
break;
sum1 += PV_CC;
sum2 += c_stat.PVB_CC(i, j, k);
}
if (sum1 > T)
c_stat.is_trained_dyn_model(i, j) = 1;
float diff = sum1 - Pbcc * sum2;
// Update stat table:
if (diff > T)
{
//new BG features are discovered
for (int k = 0; k < N1cc; ++k)
{
const float PV_CC = c_stat.PV_CC(i, j, k);
if (!PV_CC)
break;
c_stat.PVB_CC(i, j, k) = (PV_CC - Pbcc * c_stat.PVB_CC(i, j, k)) / (1.0f - Pbcc);
}
}
c_stat.Pbcc(i, j) = Pbcc;
}
// Handle "stationary" pixel:
if (!Ftd(i, j))
{
const float alpha = c_stat.is_trained_st_model(i, j) ? alpha2 : alpha3;
float Pbc = c_stat.Pbc(i, j);
//update Pb
Pbc *= (1.0f - alpha);
if (!foreground(i, j))
{
Pbc += alpha;
}
int min_dist = numeric_limits<int>::max();
int indx = -1;
CT curVal = curFrame(i, j);
//find best Vi match
for (int k = 0; k < N2c; ++k)
{
float PV_C = c_stat.PV_C(i, j, k);
if (PV_C < MIN_PV)
{
c_stat.PV_C(i, j, k) = 0;
c_stat.PVB_C(i, j, k) = 0;
continue;
}
// Exponential decay of memory
c_stat.PV_C(i, j, k) = PV_C * (1.0f - alpha);
c_stat.PVB_C(i, j, k) = c_stat.PVB_C(i, j, k) * (1.0f - alpha);
OT v = c_stat.V_C<OT>(i, j, k);
int3 val = make_int3(
::abs(v.x - curVal.x),
::abs(v.y - curVal.y),
::abs(v.z - curVal.z)
);
int dist = val.x + val.y + val.z;
if (dist < min_dist && val.x <= deltaC && val.y <= deltaC && val.z <= deltaC)
{
min_dist = dist;
indx = k;
}
}
if (indx < 0)
{
//the N2-th element in the table is replaced by a new feature
indx = N2c - 1;
c_stat.PV_C(i, j, indx) = alpha;
c_stat.PVB_C(i, j, indx) = alpha;
//update Vt
c_stat.V_C<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
}
else
{
//update
c_stat.PV_C(i, j, indx) += alpha;
if (!foreground(i, j))
{
c_stat.PVB_C(i, j, indx) += alpha;
}
}
//re-sort Ct table by Pv
const float PV_C_indx = c_stat.PV_C(i, j, indx);
const float PVB_C_indx = c_stat.PVB_C(i, j, indx);
OT V_C_indx = c_stat.V_C<OT>(i, j, indx);
for (int k = 0; k < indx; ++k)
{
if (c_stat.PV_C(i, j, k) <= PV_C_indx)
{
//shift elements
float Pv_tmp1;
float Pv_tmp2 = PV_C_indx;
float Pvb_tmp1;
float Pvb_tmp2 = PVB_C_indx;
OT v_tmp1;
OT v_tmp2 = V_C_indx;
for (int l = k; l <= indx; ++l)
{
Pv_tmp1 = c_stat.PV_C(i, j, l);
c_stat.PV_C(i, j, l) = Pv_tmp2;
Pv_tmp2 = Pv_tmp1;
Pvb_tmp1 = c_stat.PVB_C(i, j, l);
c_stat.PVB_C(i, j, l) = Pvb_tmp2;
Pvb_tmp2 = Pvb_tmp1;
v_tmp1 = c_stat.V_C<OT>(i, j, l);
c_stat.V_C<OT>(i, j, l) = v_tmp2;
v_tmp2 = v_tmp1;
}
break;
}
}
// Check "once-off" changes:
float sum1 = 0.0f;
float sum2 = 0.0f;
for (int k = 0; k < N1c; ++k)
{
const float PV_C = c_stat.PV_C(i, j, k);
if (!PV_C)
break;
sum1 += PV_C;
sum2 += c_stat.PVB_C(i, j, k);
}
if (sum1 > T)
c_stat.is_trained_st_model(i, j) = 1;
float diff = sum1 - Pbc * sum2;
// Update stat table:
if (diff > T)
{
//new BG features are discovered
for (int k = 0; k < N1c; ++k)
{
const float PV_C = c_stat.PV_C(i, j, k);
if (!PV_C)
break;
c_stat.PVB_C(i, j, k) = (PV_C - Pbc * c_stat.PVB_C(i, j, k)) / (1.0f - Pbc);
}
c_stat.Pbc(i, j) = 1.0f - Pbc;
}
else
{
c_stat.Pbc(i, j) = Pbc;
}
} // if !(change detection) at pixel (i,j)
// Update the reference BG image:
if (!foreground(i, j))
{
CT curVal = curFrame(i, j);
if (!Ftd(i, j) && !Fbd(i, j))
{
// Apply IIR filter:
OT oldVal = background(i, j);
int3 newVal = make_int3(
__float2int_rn(oldVal.x * (1.0f - alpha1) + curVal.x * alpha1),
__float2int_rn(oldVal.y * (1.0f - alpha1) + curVal.y * alpha1),
__float2int_rn(oldVal.z * (1.0f - alpha1) + curVal.z * alpha1)
);
background(i, j) = Output<OT>::make(
static_cast<uchar>(newVal.x),
static_cast<uchar>(newVal.y),
static_cast<uchar>(newVal.z)
);
}
else
{
background(i, j) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
}
}
}
template <typename PT, typename CT, typename OT>
struct UpdateBackgroundModel
{
static void call(PtrStepSz<PT> prevFrame, PtrStepSz<CT> curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSz<OT> background,
int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb>, cudaFuncCachePreferL1) );
updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb><<<grid, block, 0, stream>>>(
prevFrame.cols, prevFrame.rows,
prevFrame, curFrame,
Ftd, Fbd, foreground, background,
deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename PT, typename CT, typename OT>
void updateBackgroundModel_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background,
int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
cudaStream_t stream)
{
UpdateBackgroundModel<PT, CT, OT>::call(PtrStepSz<PT>(prevFrame), PtrStepSz<CT>(curFrame), Ftd, Fbd, foreground, PtrStepSz<OT>(background),
deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T, stream);
}
template void updateBackgroundModel_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
template void updateBackgroundModel_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
template void updateBackgroundModel_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
template void updateBackgroundModel_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
template void updateBackgroundModel_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
template void updateBackgroundModel_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
template void updateBackgroundModel_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
template void updateBackgroundModel_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -1,189 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __FGD_BGFG_COMMON_HPP__
#define __FGD_BGFG_COMMON_HPP__
#include "opencv2/core/cuda_devptrs.hpp"
namespace bgfg
{
struct BGPixelStat
{
public:
#ifdef __CUDACC__
__device__ float& Pbc(int i, int j);
__device__ float& Pbcc(int i, int j);
__device__ unsigned char& is_trained_st_model(int i, int j);
__device__ unsigned char& is_trained_dyn_model(int i, int j);
__device__ float& PV_C(int i, int j, int k);
__device__ float& PVB_C(int i, int j, int k);
template <typename T> __device__ T& V_C(int i, int j, int k);
__device__ float& PV_CC(int i, int j, int k);
__device__ float& PVB_CC(int i, int j, int k);
template <typename T> __device__ T& V1_CC(int i, int j, int k);
template <typename T> __device__ T& V2_CC(int i, int j, int k);
#endif
int rows_;
unsigned char* Pbc_data_;
size_t Pbc_step_;
unsigned char* Pbcc_data_;
size_t Pbcc_step_;
unsigned char* is_trained_st_model_data_;
size_t is_trained_st_model_step_;
unsigned char* is_trained_dyn_model_data_;
size_t is_trained_dyn_model_step_;
unsigned char* ctable_Pv_data_;
size_t ctable_Pv_step_;
unsigned char* ctable_Pvb_data_;
size_t ctable_Pvb_step_;
unsigned char* ctable_v_data_;
size_t ctable_v_step_;
unsigned char* cctable_Pv_data_;
size_t cctable_Pv_step_;
unsigned char* cctable_Pvb_data_;
size_t cctable_Pvb_step_;
unsigned char* cctable_v1_data_;
size_t cctable_v1_step_;
unsigned char* cctable_v2_data_;
size_t cctable_v2_step_;
};
#ifdef __CUDACC__
__device__ __forceinline__ float& BGPixelStat::Pbc(int i, int j)
{
return *((float*)(Pbc_data_ + i * Pbc_step_) + j);
}
__device__ __forceinline__ float& BGPixelStat::Pbcc(int i, int j)
{
return *((float*)(Pbcc_data_ + i * Pbcc_step_) + j);
}
__device__ __forceinline__ unsigned char& BGPixelStat::is_trained_st_model(int i, int j)
{
return *((unsigned char*)(is_trained_st_model_data_ + i * is_trained_st_model_step_) + j);
}
__device__ __forceinline__ unsigned char& BGPixelStat::is_trained_dyn_model(int i, int j)
{
return *((unsigned char*)(is_trained_dyn_model_data_ + i * is_trained_dyn_model_step_) + j);
}
__device__ __forceinline__ float& BGPixelStat::PV_C(int i, int j, int k)
{
return *((float*)(ctable_Pv_data_ + ((k * rows_) + i) * ctable_Pv_step_) + j);
}
__device__ __forceinline__ float& BGPixelStat::PVB_C(int i, int j, int k)
{
return *((float*)(ctable_Pvb_data_ + ((k * rows_) + i) * ctable_Pvb_step_) + j);
}
template <typename T> __device__ __forceinline__ T& BGPixelStat::V_C(int i, int j, int k)
{
return *((T*)(ctable_v_data_ + ((k * rows_) + i) * ctable_v_step_) + j);
}
__device__ __forceinline__ float& BGPixelStat::PV_CC(int i, int j, int k)
{
return *((float*)(cctable_Pv_data_ + ((k * rows_) + i) * cctable_Pv_step_) + j);
}
__device__ __forceinline__ float& BGPixelStat::PVB_CC(int i, int j, int k)
{
return *((float*)(cctable_Pvb_data_ + ((k * rows_) + i) * cctable_Pvb_step_) + j);
}
template <typename T> __device__ __forceinline__ T& BGPixelStat::V1_CC(int i, int j, int k)
{
return *((T*)(cctable_v1_data_ + ((k * rows_) + i) * cctable_v1_step_) + j);
}
template <typename T> __device__ __forceinline__ T& BGPixelStat::V2_CC(int i, int j, int k)
{
return *((T*)(cctable_v2_data_ + ((k * rows_) + i) * cctable_v2_step_) + j);
}
#endif
const int PARTIAL_HISTOGRAM_COUNT = 240;
const int HISTOGRAM_BIN_COUNT = 256;
template <typename PT, typename CT>
void calcDiffHistogram_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
bool cc20, cudaStream_t stream);
template <typename PT, typename CT>
void calcDiffThreshMask_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);
void setBGPixelStat(const BGPixelStat& stat);
template <typename PT, typename CT, typename OT>
void bgfgClassification_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground,
int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template <typename PT, typename CT, typename OT>
void updateBackgroundModel_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground, cv::gpu::PtrStepSzb background,
int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
cudaStream_t stream);
}
#endif // __FGD_BGFG_COMMON_HPP__


@@ -1,414 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace optflowbm
{
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_prev(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_curr(false, cudaFilterModePoint, cudaAddressModeClamp);
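// Sum of absolute differences between the previous-frame block at (X1, Y1) and the current-frame block at (X2, Y2), read through the textures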
__device__ int cmpBlocks(int X1, int Y1, int X2, int Y2, int2 blockSize)
{
int s = 0;
for (int y = 0; y < blockSize.y; ++y)
{
for (int x = 0; x < blockSize.x; ++x)
s += ::abs(tex2D(tex_prev, X1 + x, Y1 + y) - tex2D(tex_curr, X2 + x, Y2 + y));
}
return s;
}
__global__ void calcOptFlowBM(PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
const int maxX, const int maxY, const int acceptLevel, const int escapeLevel,
const short2* ss, const int ssCount)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i >= velx.rows || j >= velx.cols)
return;
const int X1 = j * shiftSize.x;
const int Y1 = i * shiftSize.y;
const int offX = usePrevious ? __float2int_rn(velx(i, j)) : 0;
const int offY = usePrevious ? __float2int_rn(vely(i, j)) : 0;
int X2 = X1 + offX;
int Y2 = Y1 + offY;
int dist = numeric_limits<int>::max();
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
dist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
int countMin = 1;
int sumx = offX;
int sumy = offY;
if (dist > acceptLevel)
{
// do brute-force search
for (int k = 0; k < ssCount; ++k)
{
const short2 ssVal = ss[k];
const int dx = offX + ssVal.x;
const int dy = offY + ssVal.y;
X2 = X1 + dx;
Y2 = Y1 + dy;
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
{
const int tmpDist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
if (tmpDist < acceptLevel)
{
sumx = dx;
sumy = dy;
countMin = 1;
break;
}
if (tmpDist < dist)
{
dist = tmpDist;
sumx = dx;
sumy = dy;
countMin = 1;
}
else if (tmpDist == dist)
{
sumx += dx;
sumy += dy;
countMin++;
}
}
}
if (dist > escapeLevel)
{
sumx = offX;
sumy = offY;
countMin = 1;
}
}
velx(i, j) = static_cast<float>(sumx) / countMin;
vely(i, j) = static_cast<float>(sumy) / countMin;
}
void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream)
{
bindTexture(&tex_prev, prev);
bindTexture(&tex_curr, curr);
const dim3 block(32, 8);
const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y));
calcOptFlowBM<<<grid, block, 0, stream>>>(velx, vely, blockSize, shiftSize, usePrevious,
maxX, maxY, acceptLevel, escapeLevel, ss, ssCount);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
/////////////////////////////////////////////////////////
// Fast approximate version
namespace optflowbm_fast
{
enum
{
CTA_SIZE = 128,
TILE_COLS = 128,
TILE_ROWS = 32,
STRIDE = CTA_SIZE
};
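// Each thread block covers one TILE_COLS x TILE_ROWS tile: per-candidate distances live in shared memory (dist_sums) and are updated incrementally from column sums kept in the global buffer as the block slides right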
template <typename T> __device__ __forceinline__ int calcDist(T a, T b)
{
return ::abs(a - b);
}
template <class T> struct FastOptFlowBM
{
int search_radius;
int block_radius;
int search_window;
int block_window;
PtrStepSz<T> I0;
PtrStep<T> I1;
mutable PtrStepi buffer;
FastOptFlowBM(int search_window_, int block_window_,
PtrStepSz<T> I0_, PtrStepSz<T> I1_,
PtrStepi buffer_) :
search_radius(search_window_ / 2), block_radius(block_window_ / 2),
search_window(search_window_), block_window(block_window_),
I0(I0_), I1(I1_),
buffer(buffer_)
{
}
__device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
dist_sums[index] = 0;
for (int tx = 0; tx < block_window; ++tx)
col_sums(tx, index) = 0;
int y = index / search_window;
int x = index - y * search_window;
int ay = i;
int ax = j;
int by = i + y - search_radius;
int bx = j + x - search_radius;
for (int tx = -block_radius; tx <= block_radius; ++tx)
{
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
{
int dist = calcDist(I0(ay + ty, ax + tx), I1(by + ty, bx + tx));
dist_sums[index] += dist;
col_sum += dist;
}
col_sums(tx + block_radius, index) = col_sum;
}
up_col_sums(j, index) = col_sums(block_window - 1, index);
}
}
__device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
int ay = i;
int ax = j + block_radius;
int by = i + y - search_radius;
int bx = j + x - search_radius + block_radius;
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
col_sum += calcDist(I0(ay + ty, ax), I1(by + ty, bx));
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
__device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
int ay = i;
int ax = j + block_radius;
T a_up = I0(ay - block_radius - 1, ax);
T a_down = I0(ay + block_radius, ax);
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
int by = i + y - search_radius;
int bx = j + x - search_radius + block_radius;
T b_up = I1(by - block_radius - 1, bx);
T b_down = I1(by + block_radius, bx);
int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
__device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, float& velx, float& vely) const
{
int bestDist = numeric_limits<int>::max();
int bestInd = -1;
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int curDist = dist_sums[index];
if (curDist < bestDist)
{
bestDist = curDist;
bestInd = index;
}
}
__shared__ int cta_dist_buffer[CTA_SIZE];
__shared__ int cta_ind_buffer[CTA_SIZE];
reduceKeyVal<CTA_SIZE>(cta_dist_buffer, bestDist, cta_ind_buffer, bestInd, threadIdx.x, less<int>());
if (threadIdx.x == 0)
{
int y = bestInd / search_window;
int x = bestInd - y * search_window;
velx = x - search_radius;
vely = y - search_radius;
}
}
__device__ __forceinline__ void operator()(PtrStepf velx, PtrStepf vely) const
{
int tbx = blockIdx.x * TILE_COLS;
int tby = blockIdx.y * TILE_ROWS;
int tex = ::min(tbx + TILE_COLS, I0.cols);
int tey = ::min(tby + TILE_ROWS, I0.rows);
PtrStepi col_sums;
col_sums.data = buffer.ptr(I0.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window;
col_sums.step = buffer.step;
PtrStepi up_col_sums;
up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window;
up_col_sums.step = buffer.step;
extern __shared__ int dist_sums[]; //search_window * search_window
int first = 0;
for (int i = tby; i < tey; ++i)
{
for (int j = tbx; j < tex; ++j)
{
__syncthreads();
if (j == tbx)
{
initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums);
first = 0;
}
else
{
if (i == tby)
shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
else
shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums);
first = (first + 1) % block_window;
}
__syncthreads();
convolve_window(i, j, dist_sums, velx(i, j), vely(i, j));
}
}
}
};
template<typename T> __global__ void optflowbm_fast_kernel(const FastOptFlowBM<T> fbm, PtrStepf velx, PtrStepf vely)
{
fbm(velx, vely);
}
void get_buffer_size(int src_cols, int src_rows, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
{
dim3 grid(divUp(src_cols, TILE_COLS), divUp(src_rows, TILE_ROWS));
buffer_cols = search_window * search_window * grid.y;
buffer_rows = src_cols + block_window * grid.x;
}
template <typename T>
void calc(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream)
{
FastOptFlowBM<T> fbm(search_window, block_window, I0, I1, buffer);
dim3 block(CTA_SIZE, 1);
dim3 grid(divUp(I0.cols, TILE_COLS), divUp(I0.rows, TILE_ROWS));
size_t smem = search_window * search_window * sizeof(int);
optflowbm_fast_kernel<<<grid, block, smem, stream>>>(fbm, velx, vely);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void calc<uchar>(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream);
}
#endif // !defined CUDA_DISABLER


@@ -1,220 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace optical_flow
{
#define NEEDLE_MAP_SCALE 16
#define NUM_VERTS_PER_ARROW 6
__global__ void NeedleMapAverageKernel(const PtrStepSzf u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)
{
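// One block averages the flow over a NEEDLE_MAP_SCALE x NEEDLE_MAP_SCALE patch: each thread sums its column, the column sums are tree-reduced, and thread 0 writes the scaled result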
__shared__ float smem[2 * NEEDLE_MAP_SCALE];
volatile float* u_col_sum = smem;
volatile float* v_col_sum = u_col_sum + NEEDLE_MAP_SCALE;
const int x = blockIdx.x * NEEDLE_MAP_SCALE + threadIdx.x;
const int y = blockIdx.y * NEEDLE_MAP_SCALE;
u_col_sum[threadIdx.x] = 0;
v_col_sum[threadIdx.x] = 0;
#pragma unroll
for(int i = 0; i < NEEDLE_MAP_SCALE; ++i)
{
u_col_sum[threadIdx.x] += u(::min(y + i, u.rows - 1), x);
v_col_sum[threadIdx.x] += v(::min(y + i, u.rows - 1), x);
}
if (threadIdx.x < 8)
{
// now add the column sums
const uint X = threadIdx.x;
if ((X & 1) == 0) // bit 0 is 0
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 1];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 1];
}
if ((X & 3) == 0) // bits 0 & 1 == 0
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2];
}
if ((X & 7) == 0) // bits 0, 1 & 2 == 0
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 4];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 4];
}
if (X == 0)
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 8];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 8];
}
}
if (threadIdx.x == 0)
{
const float coeff = 1.0f / (NEEDLE_MAP_SCALE * NEEDLE_MAP_SCALE);
u_col_sum[0] *= coeff;
v_col_sum[0] *= coeff;
u_avg(blockIdx.y, blockIdx.x) = u_col_sum[0];
v_avg(blockIdx.y, blockIdx.x) = v_col_sum[0];
}
}
void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg)
{
const dim3 block(NEEDLE_MAP_SCALE);
const dim3 grid(u_avg.cols, u_avg.rows);
NeedleMapAverageKernel<<<grid, block>>>(u, v, u_avg, v_avg);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void NeedleMapVertexKernel(const PtrStepSzf u_avg, const PtrStepf v_avg, float* vertex_data, float* color_data, float max_flow, float xscale, float yscale)
{
// test - just draw a triangle at each pixel
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const float arrow_x = x * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
const float arrow_y = y * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
float3 v[NUM_VERTS_PER_ARROW];
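// Each flow vector is emitted as NUM_VERTS_PER_ARROW vertices: base (v[0], v[5]), tip (v[2], v[3]) and two side points (v[1], v[4]); the color channel stores the flow angle in degrees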
if (x < u_avg.cols && y < u_avg.rows)
{
const float u_avg_val = u_avg(y, x);
const float v_avg_val = v_avg(y, x);
const float theta = ::atan2f(v_avg_val, u_avg_val);// + CV_PI;
float r = ::sqrtf(v_avg_val * v_avg_val + u_avg_val * u_avg_val);
r = fmin(14.0f * (r / max_flow), 14.0f);
v[0].z = 1.0f;
v[1].z = 0.7f;
v[2].z = 0.7f;
v[3].z = 0.7f;
v[4].z = 0.7f;
v[5].z = 1.0f;
v[0].x = arrow_x;
v[0].y = arrow_y;
v[5].x = arrow_x;
v[5].y = arrow_y;
v[2].x = arrow_x + r * ::cosf(theta);
v[2].y = arrow_y + r * ::sinf(theta);
v[3].x = v[2].x;
v[3].y = v[2].y;
r = ::fmin(r, 2.5f);
v[1].x = arrow_x + r * ::cosf(theta - CV_PI_F / 2.0f);
v[1].y = arrow_y + r * ::sinf(theta - CV_PI_F / 2.0f);
v[4].x = arrow_x + r * ::cosf(theta + CV_PI_F / 2.0f);
v[4].y = arrow_y + r * ::sinf(theta + CV_PI_F / 2.0f);
int indx = (y * u_avg.cols + x) * NUM_VERTS_PER_ARROW * 3;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[0].x * xscale;
vertex_data[indx++] = v[0].y * yscale;
vertex_data[indx++] = v[0].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[1].x * xscale;
vertex_data[indx++] = v[1].y * yscale;
vertex_data[indx++] = v[1].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[2].x * xscale;
vertex_data[indx++] = v[2].y * yscale;
vertex_data[indx++] = v[2].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[3].x * xscale;
vertex_data[indx++] = v[3].y * yscale;
vertex_data[indx++] = v[3].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[4].x * xscale;
vertex_data[indx++] = v[4].y * yscale;
vertex_data[indx++] = v[4].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[5].x * xscale;
vertex_data[indx++] = v[5].y * yscale;
vertex_data[indx++] = v[5].z;
}
}
void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale)
{
const dim3 block(16);
const dim3 grid(divUp(u_avg.cols, block.x), divUp(u_avg.rows, block.y));
NeedleMapVertexKernel<<<grid, block>>>(u_avg, v_avg, vertex_buffer, color_data, max_flow, xscale, yscale);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */


@@ -1,647 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#define tx threadIdx.x
#define ty threadIdx.y
#define bx blockIdx.x
#define by blockIdx.y
#define bdx blockDim.x
#define bdy blockDim.y
#define BORDER_SIZE 5
#define MAX_KSIZE_HALF 100
namespace cv { namespace gpu { namespace cudev { namespace optflow_farneback
{
__constant__ float c_g[8];
__constant__ float c_xg[8];
__constant__ float c_xxg[8];
__constant__ float c_ig11, c_ig03, c_ig33, c_ig55;
template <int polyN>
__global__ void polynomialExpansion(
const int height, const int width, const PtrStepf src, PtrStepf dst)
{
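// Farneback polynomial expansion: a vertical pass with the Gaussian-derived kernels (c_g, c_xg, c_xxg) fills shared memory, then a horizontal pass yields the per-pixel quadratic coefficients scaled by the inverse Gram terms c_ig*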
const int y = by * bdy + ty;
const int x = bx * (bdx - 2*polyN) + tx - polyN;
if (y < height)
{
extern __shared__ float smem[];
volatile float *row = smem + tx;
int xWarped = ::min(::max(x, 0), width - 1);
row[0] = src(y, xWarped) * c_g[0];
row[bdx] = 0.f;
row[2*bdx] = 0.f;
for (int k = 1; k <= polyN; ++k)
{
float t0 = src(::max(y - k, 0), xWarped);
float t1 = src(::min(y + k, height - 1), xWarped);
row[0] += c_g[k] * (t0 + t1);
row[bdx] += c_xg[k] * (t1 - t0);
row[2*bdx] += c_xxg[k] * (t0 + t1);
}
__syncthreads();
if (tx >= polyN && tx + polyN < bdx && x < width)
{
float b1 = c_g[0] * row[0];
float b3 = c_g[0] * row[bdx];
float b5 = c_g[0] * row[2*bdx];
float b2 = 0, b4 = 0, b6 = 0;
for (int k = 1; k <= polyN; ++k)
{
b1 += (row[k] + row[-k]) * c_g[k];
b4 += (row[k] + row[-k]) * c_xxg[k];
b2 += (row[k] - row[-k]) * c_xg[k];
b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];
b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];
b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
}
dst(y, xWarped) = b3*c_ig11;
dst(height + y, xWarped) = b2*c_ig11;
dst(2*height + y, xWarped) = b1*c_ig03 + b5*c_ig33;
dst(3*height + y, xWarped) = b1*c_ig03 + b4*c_ig33;
dst(4*height + y, xWarped) = b6*c_ig55;
}
}
}
void setPolynomialExpansionConsts(
int polyN, const float *g, const float *xg, const float *xxg,
float ig11, float ig03, float ig33, float ig55)
{
cudaSafeCall(cudaMemcpyToSymbol(c_g, g, (polyN + 1) * sizeof(*g)));
cudaSafeCall(cudaMemcpyToSymbol(c_xg, xg, (polyN + 1) * sizeof(*xg)));
cudaSafeCall(cudaMemcpyToSymbol(c_xxg, xxg, (polyN + 1) * sizeof(*xxg)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig11, &ig11, sizeof(ig11)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig03, &ig03, sizeof(ig03)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig33, &ig33, sizeof(ig33)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig55, &ig55, sizeof(ig55)));
}
void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream)
{
dim3 block(256);
dim3 grid(divUp(src.cols, block.x - 2*polyN), src.rows);
int smem = 3 * block.x * sizeof(float);
if (polyN == 5)
polynomialExpansion<5><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
else if (polyN == 7)
polynomialExpansion<7><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
__constant__ float c_border[BORDER_SIZE + 1];
__global__ void updateMatrices(
const int height, const int width, const PtrStepf flowx, const PtrStepf flowy,
const PtrStepf R0, const PtrStepf R1, PtrStepf M)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
if (y < height && x < width)
{
float dx = flowx(y, x);
float dy = flowy(y, x);
float fx = x + dx;
float fy = y + dy;
int x1 = floorf(fx);
int y1 = floorf(fy);
fx -= x1; fy -= y1;
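// bilinearly sample the second frame's expansion coefficients R1 at the flow-displaced position (fx, fy now hold the fractional offsets)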
float r2, r3, r4, r5, r6;
if (x1 >= 0 && y1 >= 0 && x1 < width - 1 && y1 < height - 1)
{
float a00 = (1.f - fx) * (1.f - fy);
float a01 = fx * (1.f - fy);
float a10 = (1.f - fx) * fy;
float a11 = fx * fy;
r2 = a00 * R1(y1, x1) +
a01 * R1(y1, x1 + 1) +
a10 * R1(y1 + 1, x1) +
a11 * R1(y1 + 1, x1 + 1);
r3 = a00 * R1(height + y1, x1) +
a01 * R1(height + y1, x1 + 1) +
a10 * R1(height + y1 + 1, x1) +
a11 * R1(height + y1 + 1, x1 + 1);
r4 = a00 * R1(2*height + y1, x1) +
a01 * R1(2*height + y1, x1 + 1) +
a10 * R1(2*height + y1 + 1, x1) +
a11 * R1(2*height + y1 + 1, x1 + 1);
r5 = a00 * R1(3*height + y1, x1) +
a01 * R1(3*height + y1, x1 + 1) +
a10 * R1(3*height + y1 + 1, x1) +
a11 * R1(3*height + y1 + 1, x1 + 1);
r6 = a00 * R1(4*height + y1, x1) +
a01 * R1(4*height + y1, x1 + 1) +
a10 * R1(4*height + y1 + 1, x1) +
a11 * R1(4*height + y1 + 1, x1 + 1);
r4 = (R0(2*height + y, x) + r4) * 0.5f;
r5 = (R0(3*height + y, x) + r5) * 0.5f;
r6 = (R0(4*height + y, x) + r6) * 0.25f;
}
else
{
r2 = r3 = 0.f;
r4 = R0(2*height + y, x);
r5 = R0(3*height + y, x);
r6 = R0(4*height + y, x) * 0.5f;
}
r2 = (R0(y, x) - r2) * 0.5f;
r3 = (R0(height + y, x) - r3) * 0.5f;
r2 += r4*dy + r6*dx;
r3 += r6*dy + r5*dx;
float scale =
c_border[::min(x, BORDER_SIZE)] *
c_border[::min(y, BORDER_SIZE)] *
c_border[::min(width - x - 1, BORDER_SIZE)] *
c_border[::min(height - y - 1, BORDER_SIZE)];
r2 *= scale; r3 *= scale; r4 *= scale;
r5 *= scale; r6 *= scale;
M(y, x) = r4*r4 + r6*r6;
M(height + y, x) = (r4 + r5)*r6;
M(2*height + y, x) = r5*r5 + r6*r6;
M(3*height + y, x) = r4*r2 + r6*r3;
M(4*height + y, x) = r6*r2 + r5*r3;
}
}
void setUpdateMatricesConsts()
{
static const float border[BORDER_SIZE + 1] = {0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f};
cudaSafeCall(cudaMemcpyToSymbol(c_border, border, (BORDER_SIZE + 1) * sizeof(*border)));
}
void updateMatricesGpu(
const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
PtrStepSzf M, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
updateMatrices<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, flowx, flowy, R0, R1, M);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
__global__ void updateFlow(
const int height, const int width, const PtrStepf M, PtrStepf flowx, PtrStepf flowy)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
if (y < height && x < width)
{
float g11 = M(y, x);
float g12 = M(height + y, x);
float g22 = M(2*height + y, x);
float h1 = M(3*height + y, x);
float h2 = M(4*height + y, x);
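// solve the per-pixel 2x2 system G * d = h; the 1e-3 term regularizes near-singular matrices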
float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);
flowx(y, x) = (g11*h2 - g12*h1) * detInv;
flowy(y, x) = (g22*h1 - g12*h2) * detInv;
}
}
void updateFlowGpu(const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
updateFlow<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, M, flowx, flowy);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
/*__global__ void boxFilter(
const int height, const int width, const PtrStepf src,
const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = ::min(::max(xExt, 0), width - 1);
row[i] = src(y, xExt);
for (int j = 1; j <= ksizeHalf; ++j)
row[i] += src(::max(y - j, 0), xExt) + src(::min(y + j, height - 1), xExt);
}
if (x < width)
{
__syncthreads();
// Horizontal pass
row += tx + ksizeHalf;
float res = row[0];
for (int i = 1; i <= ksizeHalf; ++i)
res += row[-i] + row[i];
dst(y, x) = res * boxAreaInv;
}
}
}
void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
dim3 block(256);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
boxFilter<<<grid, block, smem, stream>>>(src.rows, src.cols, src, ksizeHalf, boxAreaInv, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}*/
__global__ void boxFilter5(
const int height, const int width, const PtrStepf src,
const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
const int smw = bdx + 2*ksizeHalf; // shared memory "width"
volatile float *row = smem + 5 * ty * smw;
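// the source stacks five planes of size height x width; each plane occupies one smw-wide strip of this thread row in shared memory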
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = ::min(::max(xExt, 0), width - 1);
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] = src(k*height + y, xExt);
for (int j = 1; j <= ksizeHalf; ++j)
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] +=
src(k*height + ::max(y - j, 0), xExt) +
src(k*height + ::min(y + j, height - 1), xExt);
}
if (x < width)
{
__syncthreads();
// Horizontal pass
row += tx + ksizeHalf;
float res[5];
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] = row[k*smw];
for (int i = 1; i <= ksizeHalf; ++i)
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] += row[k*smw - i] + row[k*smw + i];
#pragma unroll
for (int k = 0; k < 5; ++k)
dst(k*height + y, x) = res[k] * boxAreaInv;
}
}
}
void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
dim3 block(256);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
dim3 block(128);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
__constant__ float c_gKer[MAX_KSIZE_HALF + 1];
template <typename Border>
__global__ void gaussianBlur(
const int height, const int width, const PtrStepf src, const int ksizeHalf,
const Border b, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = b.idx_col(xExt);
row[i] = src(y, xExt) * c_gKer[0];
for (int j = 1; j <= ksizeHalf; ++j)
row[i] +=
(src(b.idx_row_low(y - j), xExt) +
src(b.idx_row_high(y + j), xExt)) * c_gKer[j];
}
if (x < width)
{
__syncthreads();
// Horizontal pass
row += tx + ksizeHalf;
float res = row[0] * c_gKer[0];
for (int i = 1; i <= ksizeHalf; ++i)
res += (row[-i] + row[i]) * c_gKer[i];
dst(y, x) = res;
}
}
}
void setGaussianBlurKernel(const float *gKer, int ksizeHalf)
{
cudaSafeCall(cudaMemcpyToSymbol(c_gKer, gKer, (ksizeHalf + 1) * sizeof(*gKer)));
}
template <typename Border>
void gaussianBlurCaller(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows;
int width = src.cols;
dim3 block(256);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
Border b(height, width);
gaussianBlur<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void gaussianBlurGpu(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
gaussianBlurCaller<BrdReflect101<float> >,
gaussianBlurCaller<BrdReplicate<float> >,
};
callers[borderMode](src, ksizeHalf, dst, stream);
}
template <typename Border>
__global__ void gaussianBlur5(
const int height, const int width, const PtrStepf src, const int ksizeHalf,
const Border b, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
const int smw = bdx + 2*ksizeHalf; // shared memory "width"
volatile float *row = smem + 5 * ty * smw;
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = b.idx_col(xExt);
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] = src(k*height + y, xExt) * c_gKer[0];
for (int j = 1; j <= ksizeHalf; ++j)
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] +=
(src(k*height + b.idx_row_low(y - j), xExt) +
src(k*height + b.idx_row_high(y + j), xExt)) * c_gKer[j];
}
if (x < width)
{
__syncthreads();
// Horizontal pass
row += tx + ksizeHalf;
float res[5];
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] = row[k*smw] * c_gKer[0];
for (int i = 1; i <= ksizeHalf; ++i)
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];
#pragma unroll
for (int k = 0; k < 5; ++k)
dst(k*height + y, x) = res[k];
}
}
}
template <typename Border, int blockDimX>
void gaussianBlur5Caller(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
dim3 block(blockDimX);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
Border b(height, width);
gaussianBlur5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void gaussianBlur5Gpu(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
gaussianBlur5Caller<BrdReflect101<float>,256>,
gaussianBlur5Caller<BrdReplicate<float>,256>,
};
callers[borderMode](src, ksizeHalf, dst, stream);
}
void gaussianBlur5Gpu_CC11(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
gaussianBlur5Caller<BrdReflect101<float>,128>,
gaussianBlur5Caller<BrdReplicate<float>,128>,
};
callers[borderMode](src, ksizeHalf, dst, stream);
}
}}}} // namespace cv { namespace gpu { namespace cudev { namespace optflow_farneback
#endif /* CUDA_DISABLER */


@@ -1,560 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/reduce.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace pyrlk
{
__constant__ int c_winSize_x;
__constant__ int c_winSize_y;
__constant__ int c_halfWin_x;
__constant__ int c_halfWin_y;
__constant__ int c_iters;
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
template <int cn> struct Tex_I;
template <> struct Tex_I<1>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_If, x, y);
}
};
template <> struct Tex_I<4>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_If4, x, y);
}
};
template <int cn> struct Tex_J;
template <> struct Tex_J<1>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_Jf, x, y);
}
};
template <> struct Tex_J<4>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_Jf4, x, y);
}
};
__device__ __forceinline__ void accum(float& dst, float val)
{
dst += val;
}
__device__ __forceinline__ void accum(float& dst, const float4& val)
{
dst += val.x + val.y + val.z;
}
__device__ __forceinline__ float abs_(float a)
{
return ::fabsf(a);
}
__device__ __forceinline__ float4 abs_(const float4& a)
{
return abs(a);
}
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
__global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{
#if __CUDA_ARCH__ <= 110
const int BLOCK_SIZE = 128;
#else
const int BLOCK_SIZE = 256;
#endif
__shared__ float smem1[BLOCK_SIZE];
__shared__ float smem2[BLOCK_SIZE];
__shared__ float smem3[BLOCK_SIZE];
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
float2 prevPt = prevPts[blockIdx.x];
prevPt.x *= (1.0f / (1 << level));
prevPt.y *= (1.0f / (1 << level));
if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
prevPt.x -= c_halfWin_x;
prevPt.y -= c_halfWin_y;
// extract the patch from the first image, compute covariation matrix of derivatives
float A11 = 0;
float A12 = 0;
float A22 = 0;
typedef typename TypeVec<float, cn>::vec_type work_type;
work_type I_patch [PATCH_Y][PATCH_X];
work_type dIdx_patch[PATCH_Y][PATCH_X];
work_type dIdy_patch[PATCH_Y][PATCH_X];
for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i)
{
for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j)
{
float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f;
I_patch[i][j] = Tex_I<cn>::read(x, y);
// Scharr derivative
work_type dIdx = 3.0f * Tex_I<cn>::read(x+1, y-1) + 10.0f * Tex_I<cn>::read(x+1, y) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
(3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x-1, y) + 3.0f * Tex_I<cn>::read(x-1, y+1));
work_type dIdy = 3.0f * Tex_I<cn>::read(x-1, y+1) + 10.0f * Tex_I<cn>::read(x, y+1) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
(3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x, y-1) + 3.0f * Tex_I<cn>::read(x+1, y-1));
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
accum(A11, dIdx * dIdx);
accum(A12, dIdx * dIdy);
accum(A22, dIdy * dIdy);
}
}
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = A11;
smem2[0] = A12;
smem3[0] = A22;
}
#endif
__syncthreads();
A11 = smem1[0];
A12 = smem2[0];
A22 = smem3[0];
float D = A11 * A22 - A12 * A12;
if (D < numeric_limits<float>::epsilon())
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
D = 1.f / D;
A11 *= D;
A12 *= D;
A22 *= D;
float2 nextPt = nextPts[blockIdx.x];
nextPt.x *= 2.f;
nextPt.y *= 2.f;
nextPt.x -= c_halfWin_x;
nextPt.y -= c_halfWin_y;
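// iterative refinement: accumulate the (J - I) mismatch projected onto the template gradients and apply the pre-inverted 2x2 system to move nextPt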
for (int k = 0; k < c_iters; ++k)
{
if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
float b1 = 0;
float b2 = 0;
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = (J_val - I_val) * 32.0f;
accum(b1, diff * dIdx_patch[i][j]);
accum(b2, diff * dIdy_patch[i][j]);
}
}
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = b1;
smem2[0] = b2;
}
#endif
__syncthreads();
b1 = smem1[0];
b2 = smem2[0];
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
float errval = 0;
if (calcErr)
{
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = J_val - I_val;
accum(errval, abs_(diff));
}
}
reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
}
if (tid == 0)
{
nextPt.x += c_halfWin_x;
nextPt.y += c_halfWin_y;
nextPts[blockIdx.x] = nextPt;
if (calcErr)
err[blockIdx.x] = static_cast<float>(errval) / (cn * c_winSize_x * c_winSize_y);
}
}
template <int cn, int PATCH_X, int PATCH_Y>
void sparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
if (level == 0 && err)
sparseKernel<cn, PATCH_X, PATCH_Y, true><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
else
sparseKernel<cn, PATCH_X, PATCH_Y, false><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <bool calcErr>
__global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
{
extern __shared__ int smem[];
const int patchWidth = blockDim.x + 2 * c_halfWin_x;
const int patchHeight = blockDim.y + 2 * c_halfWin_y;
int* I_patch = smem;
int* dIdx_patch = I_patch + patchWidth * patchHeight;
int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;
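// stage the block's image patch (plus halo) and its Scharr derivatives in shared memory for reuse by every pixel in the block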
const int xBase = blockIdx.x * blockDim.x;
const int yBase = blockIdx.y * blockDim.y;
for (int i = threadIdx.y; i < patchHeight; i += blockDim.y)
{
for (int j = threadIdx.x; j < patchWidth; j += blockDim.x)
{
float x = xBase - c_halfWin_x + j + 0.5f;
float y = yBase - c_halfWin_y + i + 0.5f;
I_patch[i * patchWidth + j] = tex2D(tex_Ib, x, y);
// Scharr derivative
dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x+1, y-1) + 10 * tex2D(tex_Ib, x+1, y) + 3 * tex2D(tex_Ib, x+1, y+1) -
(3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x-1, y) + 3 * tex2D(tex_Ib, x-1, y+1));
dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x-1, y+1) + 10 * tex2D(tex_Ib, x, y+1) + 3 * tex2D(tex_Ib, x+1, y+1) -
(3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x, y-1) + 3 * tex2D(tex_Ib, x+1, y-1));
}
}
__syncthreads();
const int x = xBase + threadIdx.x;
const int y = yBase + threadIdx.y;
if (x >= cols || y >= rows)
return;
int A11i = 0;
int A12i = 0;
int A22i = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
A11i += dIdx * dIdx;
A12i += dIdx * dIdy;
A22i += dIdy * dIdy;
}
}
float A11 = A11i;
float A12 = A12i;
float A22 = A22i;
float D = A11 * A22 - A12 * A12;
if (D < numeric_limits<float>::epsilon())
{
if (calcErr)
err(y, x) = numeric_limits<float>::max();
return;
}
D = 1.f / D;
A11 *= D;
A12 *= D;
A22 *= D;
float2 nextPt;
nextPt.x = x + prevU(y/2, x/2) * 2.0f;
nextPt.y = y + prevV(y/2, x/2) * 2.0f;
for (int k = 0; k < c_iters; ++k)
{
if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
{
if (calcErr)
err(y, x) = numeric_limits<float>::max();
return;
}
int b1 = 0;
int b2 = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
int diff = (J - I) * 32;
int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
b1 += diff * dIdx;
b2 += diff * dIdy;
}
}
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
u(y, x) = nextPt.x - x;
v(y, x) = nextPt.y - y;
if (calcErr)
{
int errval = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
errval += ::abs(J - I);
}
}
err(y, x) = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);
}
}
void loadConstants(int2 winSize, int iters)
{
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
}
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
static const func_t funcs[5][5] =
{
{sparse_caller<1, 1, 1>, sparse_caller<1, 2, 1>, sparse_caller<1, 3, 1>, sparse_caller<1, 4, 1>, sparse_caller<1, 5, 1>},
{sparse_caller<1, 1, 2>, sparse_caller<1, 2, 2>, sparse_caller<1, 3, 2>, sparse_caller<1, 4, 2>, sparse_caller<1, 5, 2>},
{sparse_caller<1, 1, 3>, sparse_caller<1, 2, 3>, sparse_caller<1, 3, 3>, sparse_caller<1, 4, 3>, sparse_caller<1, 5, 3>},
{sparse_caller<1, 1, 4>, sparse_caller<1, 2, 4>, sparse_caller<1, 3, 4>, sparse_caller<1, 4, 4>, sparse_caller<1, 5, 4>},
{sparse_caller<1, 1, 5>, sparse_caller<1, 2, 5>, sparse_caller<1, 3, 5>, sparse_caller<1, 4, 5>, sparse_caller<1, 5, 5>}
};
bindTexture(&tex_If, I);
bindTexture(&tex_Jf, J);
funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
static const func_t funcs[5][5] =
{
{sparse_caller<4, 1, 1>, sparse_caller<4, 2, 1>, sparse_caller<4, 3, 1>, sparse_caller<4, 4, 1>, sparse_caller<4, 5, 1>},
{sparse_caller<4, 1, 2>, sparse_caller<4, 2, 2>, sparse_caller<4, 3, 2>, sparse_caller<4, 4, 2>, sparse_caller<4, 5, 2>},
{sparse_caller<4, 1, 3>, sparse_caller<4, 2, 3>, sparse_caller<4, 3, 3>, sparse_caller<4, 4, 3>, sparse_caller<4, 5, 3>},
{sparse_caller<4, 1, 4>, sparse_caller<4, 2, 4>, sparse_caller<4, 3, 4>, sparse_caller<4, 4, 4>, sparse_caller<4, 5, 4>},
{sparse_caller<4, 1, 5>, sparse_caller<4, 2, 5>, sparse_caller<4, 3, 5>, sparse_caller<4, 4, 5>, sparse_caller<4, 5, 5>}
};
bindTexture(&tex_If4, I);
bindTexture(&tex_Jf4, J);
funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
bindTexture(&tex_Ib, I);
bindTexture(&tex_Jf, J);
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
const int patchWidth = block.x + 2 * halfWin.x;
const int patchHeight = block.y + 2 * halfWin.y;
size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);
if (err.data)
{
denseKernel<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);
cudaSafeCall( cudaGetLastError() );
}
else
{
denseKernel<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif /* CUDA_DISABLER */
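The sparse1/sparse4/dense entry points above are driven from host code that selects a sparse_caller instantiation by patch size and sets the kernel constants through loadConstants(). A minimal host-side sketch, assuming the cv::gpu::PyrLKOpticalFlow wrapper exposed by this module (parameter values are illustrative):
#include "opencv2/gpu/gpu.hpp"
// Track a set of points from prevFrame to nextFrame with the pyramidal LK kernels above.
void trackSparse(const cv::Mat& prevFrame, const cv::Mat& nextFrame, const cv::Mat& prevPts /* 1 x N, CV_32FC2 */)
{
    cv::gpu::GpuMat d_prev(prevFrame), d_next(nextFrame);
    cv::gpu::GpuMat d_prevPts(prevPts);
    cv::gpu::GpuMat d_nextPts, d_status, d_err;
    cv::gpu::PyrLKOpticalFlow lk;
    lk.winSize  = cv::Size(21, 21);   // becomes c_winSize_x / c_winSize_y via loadConstants()
    lk.maxLevel = 3;                  // number of pyramid levels
    lk.iters    = 30;                 // becomes c_iters
    lk.sparse(d_prev, d_next, d_prevPts, d_nextPts, d_status, &d_err);
    // d_nextPts holds the tracked positions, d_status the per-point success flags.
}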

View File

@@ -1,332 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/limits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
////////////////////////////////////////////////////////////
// centeredGradient
namespace tvl1flow
{
__global__ void centeredGradientKernel(const PtrStepSzf src, PtrStepf dx, PtrStepf dy)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= src.cols || y >= src.rows)
return;
dx(y, x) = 0.5f * (src(y, ::min(x + 1, src.cols - 1)) - src(y, ::max(x - 1, 0)));
dy(y, x) = 0.5f * (src(::min(y + 1, src.rows - 1), x) - src(::max(y - 1, 0), x));
}
void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
centeredGradientKernel<<<grid, block>>>(src, dx, dy);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
////////////////////////////////////////////////////////////
// warpBackward
namespace tvl1flow
{
static __device__ __forceinline__ float bicubicCoeff(float x_)
{
float x = fabsf(x_);
if (x <= 1.0f)
{
return x * x * (1.5f * x - 2.5f) + 1.0f;
}
else if (x < 2.0f)
{
return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
}
else
{
return 0.0f;
}
}
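// The weights above are the Catmull-Rom cubic convolution kernel (Keys interpolation, a = -0.5):
//   W(x) =  1.5|x|^3 - 2.5|x|^2 + 1            for |x| <= 1
//   W(x) = -0.5|x|^3 + 2.5|x|^2 - 4|x| + 2     for 1 < |x| < 2
//   W(x) =  0                                  otherwise
// warpBackwardKernel below uses them as separable weights for 4x4 bicubic sampling of I1, I1x and I1y.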
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void warpBackwardKernel(const PtrStepSzf I0, const PtrStepf u1, const PtrStepf u2, PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= I0.cols || y >= I0.rows)
return;
const float u1Val = u1(y, x);
const float u2Val = u2(y, x);
const float wx = x + u1Val;
const float wy = y + u2Val;
const int xmin = ::ceilf(wx - 2.0f);
const int xmax = ::floorf(wx + 2.0f);
const int ymin = ::ceilf(wy - 2.0f);
const int ymax = ::floorf(wy + 2.0f);
float sum = 0.0f;
float sumx = 0.0f;
float sumy = 0.0f;
float wsum = 0.0f;
for (int cy = ymin; cy <= ymax; ++cy)
{
for (int cx = xmin; cx <= xmax; ++cx)
{
const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
sum += w * tex2D(tex_I1 , cx, cy);
sumx += w * tex2D(tex_I1x, cx, cy);
sumy += w * tex2D(tex_I1y, cx, cy);
wsum += w;
}
}
const float coeff = 1.0f / wsum;
const float I1wVal = sum * coeff;
const float I1wxVal = sumx * coeff;
const float I1wyVal = sumy * coeff;
I1w(y, x) = I1wVal;
I1wx(y, x) = I1wxVal;
I1wy(y, x) = I1wyVal;
const float Ix2 = I1wxVal * I1wxVal;
const float Iy2 = I1wyVal * I1wyVal;
// store the |Grad(I1)|^2
grad(y, x) = Ix2 + Iy2;
// compute the constant part of the rho function
const float I0Val = I0(y, x);
rho(y, x) = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
}
void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho)
{
const dim3 block(32, 8);
const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
bindTexture(&tex_I1 , I1);
bindTexture(&tex_I1x, I1x);
bindTexture(&tex_I1y, I1y);
warpBackwardKernel<<<grid, block>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
////////////////////////////////////////////////////////////
// estimateU
namespace tvl1flow
{
__device__ float divergence(const PtrStepf& v1, const PtrStepf& v2, int y, int x)
{
if (x > 0 && y > 0)
{
const float v1x = v1(y, x) - v1(y, x - 1);
const float v2y = v2(y, x) - v2(y - 1, x);
return v1x + v2y;
}
else
{
if (y > 0)
return v1(y, 0) + v2(y, 0) - v2(y - 1, 0);
else
{
if (x > 0)
return v1(0, x) - v1(0, x - 1) + v2(0, x);
else
return v1(0, 0) + v2(0, 0);
}
}
}
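// Backward-difference divergence of the dual field (v1, v2); the branches handle the first
// row/column, where the field is treated as zero outside the image domain.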
__global__ void estimateUKernel(const PtrStepSzf I1wx, const PtrStepf I1wy,
const PtrStepf grad, const PtrStepf rho_c,
const PtrStepf p11, const PtrStepf p12, const PtrStepf p21, const PtrStepf p22,
PtrStepf u1, PtrStepf u2, PtrStepf error,
const float l_t, const float theta)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= I1wx.cols || y >= I1wx.rows)
return;
const float I1wxVal = I1wx(y, x);
const float I1wyVal = I1wy(y, x);
const float gradVal = grad(y, x);
const float u1OldVal = u1(y, x);
const float u2OldVal = u2(y, x);
const float rho = rho_c(y, x) + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);
// estimate the values of the variable (v1, v2) (thresholding operator TH)
float d1 = 0.0f;
float d2 = 0.0f;
if (rho < -l_t * gradVal)
{
d1 = l_t * I1wxVal;
d2 = l_t * I1wyVal;
}
else if (rho > l_t * gradVal)
{
d1 = -l_t * I1wxVal;
d2 = -l_t * I1wyVal;
}
else if (gradVal > numeric_limits<float>::epsilon())
{
const float fi = -rho / gradVal;
d1 = fi * I1wxVal;
d2 = fi * I1wyVal;
}
const float v1 = u1OldVal + d1;
const float v2 = u2OldVal + d2;
// compute the divergence of the dual variable (p1, p2)
const float div_p1 = divergence(p11, p12, y, x);
const float div_p2 = divergence(p21, p22, y, x);
// estimate the values of the optical flow (u1, u2)
const float u1NewVal = v1 + theta * div_p1;
const float u2NewVal = v2 + theta * div_p2;
u1(y, x) = u1NewVal;
u2(y, x) = u2NewVal;
const float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
const float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
error(y, x) = n1 + n2;
}
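// error(y, x) holds the squared magnitude of this iteration's flow update; the host-side
// driver can sum it to obtain a convergence measure for early termination.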
void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
PtrStepSzf grad, PtrStepSzf rho_c,
PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22,
PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error,
float l_t, float theta)
{
const dim3 block(32, 8);
const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));
estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, error, l_t, theta);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
////////////////////////////////////////////////////////////
// estimateDualVariables
namespace tvl1flow
{
__global__ void estimateDualVariablesKernel(const PtrStepSzf u1, const PtrStepf u2, PtrStepf p11, PtrStepf p12, PtrStepf p21, PtrStepf p22, const float taut)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= u1.cols || y >= u1.rows)
return;
const float u1x = u1(y, ::min(x + 1, u1.cols - 1)) - u1(y, x);
const float u1y = u1(::min(y + 1, u1.rows - 1), x) - u1(y, x);
const float u2x = u2(y, ::min(x + 1, u1.cols - 1)) - u2(y, x);
const float u2y = u2(::min(y + 1, u1.rows - 1), x) - u2(y, x);
const float g1 = ::hypotf(u1x, u1y);
const float g2 = ::hypotf(u2x, u2y);
const float ng1 = 1.0f + taut * g1;
const float ng2 = 1.0f + taut * g2;
p11(y, x) = (p11(y, x) + taut * u1x) / ng1;
p12(y, x) = (p12(y, x) + taut * u1y) / ng1;
p21(y, x) = (p21(y, x) + taut * u2x) / ng2;
p22(y, x) = (p22(y, x) + taut * u2y) / ng2;
}
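// This is the TV-L1 dual ascent step with the normalization used in the original algorithm:
//   p^{n+1} = (p^n + taut * grad(u)) / (1 + taut * |grad(u)|),   taut = tau / theta,
// which keeps the dual variables bounded (an approximate projection onto the unit ball).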
void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, float taut)
{
const dim3 block(32, 8);
const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));
estimateDualVariablesKernel<<<grid, block>>>(u1, u2, p11, p12, p21, p22, taut);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif // !defined CUDA_DISABLER
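The four routines in this file are per-level building blocks; the warping/fixed-point loop that chains them runs on the host. A condensed sketch of one pyramid level, assuming GpuMat buffers of matching size (the function name, the early-exit test and the buffer management of the real driver are omitted or simplified here):
#include "opencv2/gpu/gpu.hpp"
using cv::gpu::GpuMat;
// One pyramid level of the TV-L1 solver, chaining the kernels declared above (illustrative sketch).
static void procOneScaleSketch(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2,
                               GpuMat& I1x, GpuMat& I1y, GpuMat& I1w, GpuMat& I1wx, GpuMat& I1wy,
                               GpuMat& grad, GpuMat& rho_c,
                               GpuMat& p11, GpuMat& p12, GpuMat& p21, GpuMat& p22, GpuMat& diff,
                               int warps, int iterations, float lambda, float theta, float tau)
{
    const float l_t  = lambda * theta;   // threshold of the TH operator in estimateUKernel
    const float taut = tau / theta;      // step of the dual update
    tvl1flow::centeredGradient(I1, I1x, I1y);
    for (int w = 0; w < warps; ++w)
    {
        // warp I1 and its derivatives towards I0 using the current flow (u1, u2)
        tvl1flow::warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);
        for (int n = 0; n < iterations; ++n)
        {
            tvl1flow::estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, diff, l_t, theta);
            tvl1flow::estimateDualVariables(u1, u2, p11, p12, p21, p22, taut);
        }
    }
}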

View File

@@ -1,753 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
class cv::gpu::FGDStatModel::Impl
{
};
cv::gpu::FGDStatModel::Params::Params() { throw_no_cuda(); }
cv::gpu::FGDStatModel::FGDStatModel(int) { throw_no_cuda(); }
cv::gpu::FGDStatModel::FGDStatModel(const cv::gpu::GpuMat&, const Params&, int) { throw_no_cuda(); }
cv::gpu::FGDStatModel::~FGDStatModel() {}
void cv::gpu::FGDStatModel::create(const cv::gpu::GpuMat&, const Params&) { throw_no_cuda(); }
void cv::gpu::FGDStatModel::release() {}
int cv::gpu::FGDStatModel::update(const cv::gpu::GpuMat&) { throw_no_cuda(); return 0; }
#else
#include "fgd_bgfg_common.hpp"
#include "opencv2/imgproc/imgproc_c.h"
namespace
{
class BGPixelStat
{
public:
void create(cv::Size size, const cv::gpu::FGDStatModel::Params& params, int out_cn);
void release();
void setTrained();
operator bgfg::BGPixelStat();
private:
cv::gpu::GpuMat Pbc_;
cv::gpu::GpuMat Pbcc_;
cv::gpu::GpuMat is_trained_st_model_;
cv::gpu::GpuMat is_trained_dyn_model_;
cv::gpu::GpuMat ctable_Pv_;
cv::gpu::GpuMat ctable_Pvb_;
cv::gpu::GpuMat ctable_v_;
cv::gpu::GpuMat cctable_Pv_;
cv::gpu::GpuMat cctable_Pvb_;
cv::gpu::GpuMat cctable_v1_;
cv::gpu::GpuMat cctable_v2_;
};
void BGPixelStat::create(cv::Size size, const cv::gpu::FGDStatModel::Params& params, int out_cn)
{
cv::gpu::ensureSizeIsEnough(size, CV_32FC1, Pbc_);
Pbc_.setTo(cv::Scalar::all(0));
cv::gpu::ensureSizeIsEnough(size, CV_32FC1, Pbcc_);
Pbcc_.setTo(cv::Scalar::all(0));
cv::gpu::ensureSizeIsEnough(size, CV_8UC1, is_trained_st_model_);
is_trained_st_model_.setTo(cv::Scalar::all(0));
cv::gpu::ensureSizeIsEnough(size, CV_8UC1, is_trained_dyn_model_);
is_trained_dyn_model_.setTo(cv::Scalar::all(0));
cv::gpu::ensureSizeIsEnough(params.N2c * size.height, size.width, CV_32FC1, ctable_Pv_);
ctable_Pv_.setTo(cv::Scalar::all(0));
cv::gpu::ensureSizeIsEnough(params.N2c * size.height, size.width, CV_32FC1, ctable_Pvb_);
ctable_Pvb_.setTo(cv::Scalar::all(0));
cv::gpu::ensureSizeIsEnough(params.N2c * size.height, size.width, CV_8UC(out_cn), ctable_v_);
ctable_v_.setTo(cv::Scalar::all(0));
cv::gpu::ensureSizeIsEnough(params.N2cc * size.height, size.width, CV_32FC1, cctable_Pv_);
cctable_Pv_.setTo(cv::Scalar::all(0));
cv::gpu::ensureSizeIsEnough(params.N2cc * size.height, size.width, CV_32FC1, cctable_Pvb_);
cctable_Pvb_.setTo(cv::Scalar::all(0));
cv::gpu::ensureSizeIsEnough(params.N2cc * size.height, size.width, CV_8UC(out_cn), cctable_v1_);
cctable_v1_.setTo(cv::Scalar::all(0));
cv::gpu::ensureSizeIsEnough(params.N2cc * size.height, size.width, CV_8UC(out_cn), cctable_v2_);
cctable_v2_.setTo(cv::Scalar::all(0));
}
void BGPixelStat::release()
{
Pbc_.release();
Pbcc_.release();
is_trained_st_model_.release();
is_trained_dyn_model_.release();
ctable_Pv_.release();
ctable_Pvb_.release();
ctable_v_.release();
cctable_Pv_.release();
cctable_Pvb_.release();
cctable_v1_.release();
cctable_v2_.release();
}
void BGPixelStat::setTrained()
{
is_trained_st_model_.setTo(cv::Scalar::all(1));
is_trained_dyn_model_.setTo(cv::Scalar::all(1));
}
BGPixelStat::operator bgfg::BGPixelStat()
{
bgfg::BGPixelStat stat;
stat.rows_ = Pbc_.rows;
stat.Pbc_data_ = Pbc_.data;
stat.Pbc_step_ = Pbc_.step;
stat.Pbcc_data_ = Pbcc_.data;
stat.Pbcc_step_ = Pbcc_.step;
stat.is_trained_st_model_data_ = is_trained_st_model_.data;
stat.is_trained_st_model_step_ = is_trained_st_model_.step;
stat.is_trained_dyn_model_data_ = is_trained_dyn_model_.data;
stat.is_trained_dyn_model_step_ = is_trained_dyn_model_.step;
stat.ctable_Pv_data_ = ctable_Pv_.data;
stat.ctable_Pv_step_ = ctable_Pv_.step;
stat.ctable_Pvb_data_ = ctable_Pvb_.data;
stat.ctable_Pvb_step_ = ctable_Pvb_.step;
stat.ctable_v_data_ = ctable_v_.data;
stat.ctable_v_step_ = ctable_v_.step;
stat.cctable_Pv_data_ = cctable_Pv_.data;
stat.cctable_Pv_step_ = cctable_Pv_.step;
stat.cctable_Pvb_data_ = cctable_Pvb_.data;
stat.cctable_Pvb_step_ = cctable_Pvb_.step;
stat.cctable_v1_data_ = cctable_v1_.data;
stat.cctable_v1_step_ = cctable_v1_.step;
stat.cctable_v2_data_ = cctable_v2_.data;
stat.cctable_v2_step_ = cctable_v2_.step;
return stat;
}
}
class cv::gpu::FGDStatModel::Impl
{
public:
Impl(cv::gpu::GpuMat& background, cv::gpu::GpuMat& foreground, std::vector< std::vector<cv::Point> >& foreground_regions, int out_cn);
~Impl();
void create(const cv::gpu::GpuMat& firstFrame, const cv::gpu::FGDStatModel::Params& params);
void release();
int update(const cv::gpu::GpuMat& curFrame);
private:
Impl(const Impl&);
Impl& operator=(const Impl&);
int out_cn_;
cv::gpu::FGDStatModel::Params params_;
cv::gpu::GpuMat& background_;
cv::gpu::GpuMat& foreground_;
std::vector< std::vector<cv::Point> >& foreground_regions_;
cv::Mat h_foreground_;
cv::gpu::GpuMat prevFrame_;
cv::gpu::GpuMat Ftd_;
cv::gpu::GpuMat Fbd_;
BGPixelStat stat_;
cv::gpu::GpuMat hist_;
cv::gpu::GpuMat histBuf_;
cv::gpu::GpuMat countBuf_;
cv::gpu::GpuMat buf_;
cv::gpu::GpuMat filterBuf_;
cv::gpu::GpuMat filterBrd_;
cv::Ptr<cv::gpu::FilterEngine_GPU> dilateFilter_;
cv::Ptr<cv::gpu::FilterEngine_GPU> erodeFilter_;
CvMemStorage* storage_;
};
cv::gpu::FGDStatModel::Impl::Impl(cv::gpu::GpuMat& background, cv::gpu::GpuMat& foreground, std::vector< std::vector<cv::Point> >& foreground_regions, int out_cn) :
out_cn_(out_cn), background_(background), foreground_(foreground), foreground_regions_(foreground_regions)
{
CV_Assert( out_cn_ == 3 || out_cn_ == 4 );
storage_ = cvCreateMemStorage();
CV_Assert( storage_ != 0 );
}
cv::gpu::FGDStatModel::Impl::~Impl()
{
cvReleaseMemStorage(&storage_);
}
namespace
{
void copyChannels(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, int dst_cn = -1)
{
const int src_cn = src.channels();
if (dst_cn < 0)
dst_cn = src_cn;
cv::gpu::ensureSizeIsEnough(src.size(), CV_MAKE_TYPE(src.depth(), dst_cn), dst);
if (src_cn == dst_cn)
src.copyTo(dst);
else
{
static const int cvt_codes[4][4] =
{
{-1, -1, cv::COLOR_GRAY2BGR, cv::COLOR_GRAY2BGRA},
{-1, -1, -1, -1},
{cv::COLOR_BGR2GRAY, -1, -1, cv::COLOR_BGR2BGRA},
{cv::COLOR_BGRA2GRAY, -1, cv::COLOR_BGRA2BGR, -1}
};
const int cvt_code = cvt_codes[src_cn - 1][dst_cn - 1];
CV_DbgAssert( cvt_code >= 0 );
cv::gpu::cvtColor(src, dst, cvt_code, dst_cn);
}
}
}
void cv::gpu::FGDStatModel::Impl::create(const cv::gpu::GpuMat& firstFrame, const cv::gpu::FGDStatModel::Params& params)
{
CV_Assert(firstFrame.type() == CV_8UC3 || firstFrame.type() == CV_8UC4);
params_ = params;
cv::gpu::ensureSizeIsEnough(firstFrame.size(), CV_8UC1, foreground_);
copyChannels(firstFrame, background_, out_cn_);
copyChannels(firstFrame, prevFrame_);
cv::gpu::ensureSizeIsEnough(firstFrame.size(), CV_8UC1, Ftd_);
cv::gpu::ensureSizeIsEnough(firstFrame.size(), CV_8UC1, Fbd_);
stat_.create(firstFrame.size(), params_, out_cn_);
bgfg::setBGPixelStat(stat_);
if (params_.perform_morphing > 0)
{
cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(1 + params_.perform_morphing * 2, 1 + params_.perform_morphing * 2));
cv::Point anchor(params_.perform_morphing, params_.perform_morphing);
dilateFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_DILATE, CV_8UC1, kernel, filterBuf_, anchor);
erodeFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_ERODE, CV_8UC1, kernel, filterBuf_, anchor);
}
}
void cv::gpu::FGDStatModel::Impl::release()
{
background_.release();
foreground_.release();
prevFrame_.release();
Ftd_.release();
Fbd_.release();
stat_.release();
hist_.release();
histBuf_.release();
countBuf_.release();
buf_.release();
filterBuf_.release();
filterBrd_.release();
}
/////////////////////////////////////////////////////////////////////////
// changeDetection
namespace
{
void calcDiffHistogram(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::gpu::GpuMat& hist, cv::gpu::GpuMat& histBuf)
{
typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
static const func_t funcs[4][4] =
{
{0,0,0,0},
{0,0,0,0},
{0,0,bgfg::calcDiffHistogram_gpu<uchar3, uchar3>,bgfg::calcDiffHistogram_gpu<uchar3, uchar4>},
{0,0,bgfg::calcDiffHistogram_gpu<uchar4, uchar3>,bgfg::calcDiffHistogram_gpu<uchar4, uchar4>}
};
hist.create(3, 256, CV_32SC1);
histBuf.create(3, bgfg::PARTIAL_HISTOGRAM_COUNT * bgfg::HISTOGRAM_BIN_COUNT, CV_32SC1);
funcs[prevFrame.channels() - 1][curFrame.channels() - 1](
prevFrame, curFrame,
hist.ptr<unsigned int>(0), hist.ptr<unsigned int>(1), hist.ptr<unsigned int>(2),
histBuf.ptr<unsigned int>(0), histBuf.ptr<unsigned int>(1), histBuf.ptr<unsigned int>(2),
cv::gpu::deviceSupports(cv::gpu::FEATURE_SET_COMPUTE_20), 0);
}
void calcRelativeVariance(unsigned int hist[3 * 256], double relativeVariance[3][bgfg::HISTOGRAM_BIN_COUNT])
{
std::memset(relativeVariance, 0, 3 * bgfg::HISTOGRAM_BIN_COUNT * sizeof(double));
for (int thres = bgfg::HISTOGRAM_BIN_COUNT - 2; thres >= 0; --thres)
{
cv::Vec3d sum(0.0, 0.0, 0.0);
cv::Vec3d sqsum(0.0, 0.0, 0.0);
cv::Vec3i count(0, 0, 0);
for (int j = thres; j < bgfg::HISTOGRAM_BIN_COUNT; ++j)
{
sum[0] += static_cast<double>(j) * hist[j];
sqsum[0] += static_cast<double>(j * j) * hist[j];
count[0] += hist[j];
sum[1] += static_cast<double>(j) * hist[j + 256];
sqsum[1] += static_cast<double>(j * j) * hist[j + 256];
count[1] += hist[j + 256];
sum[2] += static_cast<double>(j) * hist[j + 512];
sqsum[2] += static_cast<double>(j * j) * hist[j + 512];
count[2] += hist[j + 512];
}
count[0] = std::max(count[0], 1);
count[1] = std::max(count[1], 1);
count[2] = std::max(count[2], 1);
cv::Vec3d my(
sum[0] / count[0],
sum[1] / count[1],
sum[2] / count[2]
);
relativeVariance[0][thres] = std::sqrt(sqsum[0] / count[0] - my[0] * my[0]);
relativeVariance[1][thres] = std::sqrt(sqsum[1] / count[1] - my[1] * my[1]);
relativeVariance[2][thres] = std::sqrt(sqsum[2] / count[2] - my[2] * my[2]);
}
}
void calcDiffThreshMask(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::Vec3d bestThres, cv::gpu::GpuMat& changeMask)
{
typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);
static const func_t funcs[4][4] =
{
{0,0,0,0},
{0,0,0,0},
{0,0,bgfg::calcDiffThreshMask_gpu<uchar3, uchar3>,bgfg::calcDiffThreshMask_gpu<uchar3, uchar4>},
{0,0,bgfg::calcDiffThreshMask_gpu<uchar4, uchar3>,bgfg::calcDiffThreshMask_gpu<uchar4, uchar4>}
};
changeMask.setTo(cv::Scalar::all(0));
funcs[prevFrame.channels() - 1][curFrame.channels() - 1](prevFrame, curFrame, make_uchar3((uchar)bestThres[0], (uchar)bestThres[1], (uchar)bestThres[2]), changeMask, 0);
}
// Performs change detection for the foreground detection algorithm
void changeDetection(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::gpu::GpuMat& changeMask, cv::gpu::GpuMat& hist, cv::gpu::GpuMat& histBuf)
{
calcDiffHistogram(prevFrame, curFrame, hist, histBuf);
unsigned int histData[3 * 256];
cv::Mat h_hist(3, 256, CV_32SC1, histData);
hist.download(h_hist);
double relativeVariance[3][bgfg::HISTOGRAM_BIN_COUNT];
calcRelativeVariance(histData, relativeVariance);
// Find maximum:
cv::Vec3d bestThres(10.0, 10.0, 10.0);
for (int i = 0; i < bgfg::HISTOGRAM_BIN_COUNT; ++i)
{
bestThres[0] = std::max(bestThres[0], relativeVariance[0][i]);
bestThres[1] = std::max(bestThres[1], relativeVariance[1][i]);
bestThres[2] = std::max(bestThres[2], relativeVariance[2][i]);
}
calcDiffThreshMask(prevFrame, curFrame, bestThres, changeMask);
}
}
/////////////////////////////////////////////////////////////////////////
// bgfgClassification
namespace
{
int bgfgClassification(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame,
const cv::gpu::GpuMat& Ftd, const cv::gpu::GpuMat& Fbd,
cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& countBuf,
const cv::gpu::FGDStatModel::Params& params, int out_cn)
{
typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground,
int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
static const func_t funcs[4][4][4] =
{
{
{0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
},
{
{0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
},
{
{0,0,0,0}, {0,0,0,0},
{0,0,bgfg::bgfgClassification_gpu<uchar3, uchar3, uchar3>,bgfg::bgfgClassification_gpu<uchar3, uchar3, uchar4>},
{0,0,bgfg::bgfgClassification_gpu<uchar3, uchar4, uchar3>,bgfg::bgfgClassification_gpu<uchar3, uchar4, uchar4>}
},
{
{0,0,0,0}, {0,0,0,0},
{0,0,bgfg::bgfgClassification_gpu<uchar4, uchar3, uchar3>,bgfg::bgfgClassification_gpu<uchar4, uchar3, uchar4>},
{0,0,bgfg::bgfgClassification_gpu<uchar4, uchar4, uchar3>,bgfg::bgfgClassification_gpu<uchar4, uchar4, uchar4>}
}
};
const int deltaC = cvRound(params.delta * 256 / params.Lc);
const int deltaCC = cvRound(params.delta * 256 / params.Lcc);
funcs[prevFrame.channels() - 1][curFrame.channels() - 1][out_cn - 1](prevFrame, curFrame, Ftd, Fbd, foreground, deltaC, deltaCC, params.alpha2, params.N1c, params.N1cc, 0);
int count = cv::gpu::countNonZero(foreground, countBuf);
cv::gpu::multiply(foreground, cv::Scalar::all(255), foreground);
return count;
}
}
/////////////////////////////////////////////////////////////////////////
// smoothForeground
namespace
{
void morphology(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::GpuMat& filterBrd, int brd, cv::Ptr<cv::gpu::FilterEngine_GPU>& filter, cv::Scalar brdVal)
{
cv::gpu::copyMakeBorder(src, filterBrd, brd, brd, brd, brd, cv::BORDER_CONSTANT, brdVal);
filter->apply(filterBrd(cv::Rect(brd, brd, src.cols, src.rows)), dst, cv::Rect(0, 0, src.cols, src.rows));
}
void smoothForeground(cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& filterBrd, cv::gpu::GpuMat& buf,
cv::Ptr<cv::gpu::FilterEngine_GPU>& erodeFilter, cv::Ptr<cv::gpu::FilterEngine_GPU>& dilateFilter,
const cv::gpu::FGDStatModel::Params& params)
{
const int brd = params.perform_morphing;
const cv::Scalar erodeBrdVal = cv::Scalar::all(UCHAR_MAX);
const cv::Scalar dilateBrdVal = cv::Scalar::all(0);
// MORPH_OPEN
morphology(foreground, buf, filterBrd, brd, erodeFilter, erodeBrdVal);
morphology(buf, foreground, filterBrd, brd, dilateFilter, dilateBrdVal);
// MORPH_CLOSE
morphology(foreground, buf, filterBrd, brd, dilateFilter, dilateBrdVal);
morphology(buf, foreground, filterBrd, brd, erodeFilter, erodeBrdVal);
}
}
/////////////////////////////////////////////////////////////////////////
// findForegroundRegions
namespace
{
void seqToContours(CvSeq* _ccontours, CvMemStorage* storage, cv::OutputArrayOfArrays _contours)
{
cv::Seq<CvSeq*> all_contours(cvTreeToNodeSeq(_ccontours, sizeof(CvSeq), storage));
size_t total = all_contours.size();
_contours.create((int) total, 1, 0, -1, true);
cv::SeqIterator<CvSeq*> it = all_contours.begin();
for (size_t i = 0; i < total; ++i, ++it)
{
CvSeq* c = *it;
((CvContour*)c)->color = (int)i;
_contours.create((int)c->total, 1, CV_32SC2, (int)i, true);
cv::Mat ci = _contours.getMat((int)i);
CV_Assert( ci.isContinuous() );
cvCvtSeqToArray(c, ci.data);
}
}
int findForegroundRegions(cv::gpu::GpuMat& d_foreground, cv::Mat& h_foreground, std::vector< std::vector<cv::Point> >& foreground_regions,
CvMemStorage* storage, const cv::gpu::FGDStatModel::Params& params)
{
int region_count = 0;
// Discard under-size foreground regions:
d_foreground.download(h_foreground);
IplImage ipl_foreground = h_foreground;
CvSeq* first_seq = 0;
cvFindContours(&ipl_foreground, storage, &first_seq, sizeof(CvContour), CV_RETR_LIST);
for (CvSeq* seq = first_seq; seq; seq = seq->h_next)
{
CvContour* cnt = reinterpret_cast<CvContour*>(seq);
if (cnt->rect.width * cnt->rect.height < params.minArea || (params.is_obj_without_holes && CV_IS_SEQ_HOLE(seq)))
{
// Delete under-size contour:
CvSeq* prev_seq = seq->h_prev;
if (prev_seq)
{
prev_seq->h_next = seq->h_next;
if (seq->h_next)
seq->h_next->h_prev = prev_seq;
}
else
{
first_seq = seq->h_next;
if (seq->h_next)
seq->h_next->h_prev = NULL;
}
}
else
{
region_count++;
}
}
seqToContours(first_seq, storage, foreground_regions);
h_foreground.setTo(0);
cv::drawContours(h_foreground, foreground_regions, -1, cv::Scalar::all(255), -1);
d_foreground.upload(h_foreground);
return region_count;
}
}
/////////////////////////////////////////////////////////////////////////
// updateBackgroundModel
namespace
{
void updateBackgroundModel(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, const cv::gpu::GpuMat& Ftd, const cv::gpu::GpuMat& Fbd,
const cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& background,
const cv::gpu::FGDStatModel::Params& params)
{
typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd,
cv::gpu::PtrStepSzb foreground, cv::gpu::PtrStepSzb background,
int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
static const func_t funcs[4][4][4] =
{
{
{0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
},
{
{0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
},
{
{0,0,0,0}, {0,0,0,0},
{0,0,bgfg::updateBackgroundModel_gpu<uchar3, uchar3, uchar3>,bgfg::updateBackgroundModel_gpu<uchar3, uchar3, uchar4>},
{0,0,bgfg::updateBackgroundModel_gpu<uchar3, uchar4, uchar3>,bgfg::updateBackgroundModel_gpu<uchar3, uchar4, uchar4>}
},
{
{0,0,0,0}, {0,0,0,0},
{0,0,bgfg::updateBackgroundModel_gpu<uchar4, uchar3, uchar3>,bgfg::updateBackgroundModel_gpu<uchar4, uchar3, uchar4>},
{0,0,bgfg::updateBackgroundModel_gpu<uchar4, uchar4, uchar3>,bgfg::updateBackgroundModel_gpu<uchar4, uchar4, uchar4>}
}
};
const int deltaC = cvRound(params.delta * 256 / params.Lc);
const int deltaCC = cvRound(params.delta * 256 / params.Lcc);
funcs[prevFrame.channels() - 1][curFrame.channels() - 1][background.channels() - 1](
prevFrame, curFrame, Ftd, Fbd, foreground, background,
deltaC, deltaCC, params.alpha1, params.alpha2, params.alpha3, params.N1c, params.N1cc, params.N2c, params.N2cc, params.T,
0);
}
}
/////////////////////////////////////////////////////////////////////////
// Impl::update
int cv::gpu::FGDStatModel::Impl::update(const cv::gpu::GpuMat& curFrame)
{
CV_Assert(curFrame.type() == CV_8UC3 || curFrame.type() == CV_8UC4);
CV_Assert(curFrame.size() == prevFrame_.size());
cvClearMemStorage(storage_);
foreground_regions_.clear();
foreground_.setTo(cv::Scalar::all(0));
changeDetection(prevFrame_, curFrame, Ftd_, hist_, histBuf_);
changeDetection(background_, curFrame, Fbd_, hist_, histBuf_);
int FG_pixels_count = bgfgClassification(prevFrame_, curFrame, Ftd_, Fbd_, foreground_, countBuf_, params_, out_cn_);
if (params_.perform_morphing > 0)
smoothForeground(foreground_, filterBrd_, buf_, erodeFilter_, dilateFilter_, params_);
int region_count = 0;
if (params_.minArea > 0 || params_.is_obj_without_holes)
region_count = findForegroundRegions(foreground_, h_foreground_, foreground_regions_, storage_, params_);
// Check ALL BG update condition:
const double BGFG_FGD_BG_UPDATE_TRESH = 0.5;
if (static_cast<double>(FG_pixels_count) / Ftd_.size().area() > BGFG_FGD_BG_UPDATE_TRESH)
stat_.setTrained();
updateBackgroundModel(prevFrame_, curFrame, Ftd_, Fbd_, foreground_, background_, params_);
copyChannels(curFrame, prevFrame_);
return region_count;
}
namespace
{
// Default parameters of the foreground detection algorithm:
const int BGFG_FGD_LC = 128;
const int BGFG_FGD_N1C = 15;
const int BGFG_FGD_N2C = 25;
const int BGFG_FGD_LCC = 64;
const int BGFG_FGD_N1CC = 25;
const int BGFG_FGD_N2CC = 40;
// Background reference image update parameter:
const float BGFG_FGD_ALPHA_1 = 0.1f;
// Statistical model update parameter:
// 0.002f ~ 1K frames (~45 sec), 0.005f ~ 18 sec (at 25 fps with an absolutely static background)
const float BGFG_FGD_ALPHA_2 = 0.005f;
// Initial value of the alpha parameter (to quickly initialize the statistical model)
const float BGFG_FGD_ALPHA_3 = 0.1f;
const float BGFG_FGD_DELTA = 2.0f;
const float BGFG_FGD_T = 0.9f;
const float BGFG_FGD_MINAREA= 15.0f;
}
cv::gpu::FGDStatModel::Params::Params()
{
Lc = BGFG_FGD_LC;
N1c = BGFG_FGD_N1C;
N2c = BGFG_FGD_N2C;
Lcc = BGFG_FGD_LCC;
N1cc = BGFG_FGD_N1CC;
N2cc = BGFG_FGD_N2CC;
delta = BGFG_FGD_DELTA;
alpha1 = BGFG_FGD_ALPHA_1;
alpha2 = BGFG_FGD_ALPHA_2;
alpha3 = BGFG_FGD_ALPHA_3;
T = BGFG_FGD_T;
minArea = BGFG_FGD_MINAREA;
is_obj_without_holes = true;
perform_morphing = 1;
}
cv::gpu::FGDStatModel::FGDStatModel(int out_cn)
{
impl_.reset(new Impl(background, foreground, foreground_regions, out_cn));
}
cv::gpu::FGDStatModel::FGDStatModel(const cv::gpu::GpuMat& firstFrame, const Params& params, int out_cn)
{
impl_.reset(new Impl(background, foreground, foreground_regions, out_cn));
create(firstFrame, params);
}
cv::gpu::FGDStatModel::~FGDStatModel()
{
}
void cv::gpu::FGDStatModel::create(const cv::gpu::GpuMat& firstFrame, const Params& params)
{
impl_->create(firstFrame, params);
}
void cv::gpu::FGDStatModel::release()
{
impl_->release();
}
int cv::gpu::FGDStatModel::update(const cv::gpu::GpuMat& curFrame)
{
return impl_->update(curFrame);
}
#endif // HAVE_CUDA
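For reference, a minimal host-side loop driving the model implemented above (the input file name is illustrative):
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/highgui/highgui.hpp"
int main()
{
    cv::VideoCapture cap("video.avi");   // illustrative input
    cv::Mat frame;
    cap >> frame;                        // first frame (CV_8UC3) initializes the model
    cv::gpu::GpuMat d_frame(frame);
    cv::gpu::FGDStatModel fgd(d_frame);
    for (;;)
    {
        cap >> frame;
        if (frame.empty())
            break;
        d_frame.upload(frame);
        const int regions = fgd.update(d_frame);   // number of foreground regions found
        // fgd.foreground (CV_8UC1 mask), fgd.background and fgd.foreground_regions
        // (contours) are refreshed by each update() call.
        (void)regions;
    }
    return 0;
}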

View File

@@ -1,242 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::calcOpticalFlowBM(const GpuMat&, const GpuMat&, Size, Size, Size, bool, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::FastOpticalFlowBM::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, int, int, Stream&) { throw_no_cuda(); }
#else // HAVE_CUDA
namespace optflowbm
{
void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream);
}
void cv::gpu::calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr, Size blockSize, Size shiftSize, Size maxRange, bool usePrevious, GpuMat& velx, GpuMat& vely, GpuMat& buf, Stream& st)
{
CV_Assert( prev.type() == CV_8UC1 );
CV_Assert( curr.size() == prev.size() && curr.type() == prev.type() );
const Size velSize((prev.cols - blockSize.width + shiftSize.width) / shiftSize.width,
(prev.rows - blockSize.height + shiftSize.height) / shiftSize.height);
velx.create(velSize, CV_32FC1);
vely.create(velSize, CV_32FC1);
// scanning scheme coordinates
std::vector<short2> ss((2 * maxRange.width + 1) * (2 * maxRange.height + 1));
int ssCount = 0;
// Calculate scanning scheme
const int minCount = std::min(maxRange.width, maxRange.height);
// use spiral search pattern
//
// 9 10 11 12
// 8 1 2 13
// 7 * 3 14
// 6 5 4 15
//... 20 19 18 17
//
for (int i = 0; i < minCount; ++i)
{
// four cycles along sides
int x = -i - 1, y = x;
// upper side
for (int j = -i; j <= i + 1; ++j, ++ssCount)
{
ss[ssCount].x = ++x;
ss[ssCount].y = y;
}
// right side
for (int j = -i; j <= i + 1; ++j, ++ssCount)
{
ss[ssCount].x = x;
ss[ssCount].y = ++y;
}
// bottom side
for (int j = -i; j <= i + 1; ++j, ++ssCount)
{
ss[ssCount].x = --x;
ss[ssCount].y = y;
}
// left side
for (int j = -i; j <= i + 1; ++j, ++ssCount)
{
ss[ssCount].x = x;
ss[ssCount].y = --y;
}
}
// the remaining part of the scanning scheme
if (maxRange.width < maxRange.height)
{
const int xleft = -minCount;
// iterate over the remaining neighbor rings
for (int i = minCount; i < maxRange.height; ++i)
{
// two cycles by x
int y = -(i + 1);
int x = xleft;
// upper side
for (int j = -maxRange.width; j <= maxRange.width; ++j, ++ssCount, ++x)
{
ss[ssCount].x = x;
ss[ssCount].y = y;
}
x = xleft;
y = -y;
// bottom side
for (int j = -maxRange.width; j <= maxRange.width; ++j, ++ssCount, ++x)
{
ss[ssCount].x = x;
ss[ssCount].y = y;
}
}
}
else if (maxRange.width > maxRange.height)
{
const int yupper = -minCount;
// iterate over the remaining neighbor rings
for (int i = minCount; i < maxRange.width; ++i)
{
// two cycles by y
int x = -(i + 1);
int y = yupper;
// left side
for (int j = -maxRange.height; j <= maxRange.height; ++j, ++ssCount, ++y)
{
ss[ssCount].x = x;
ss[ssCount].y = y;
}
y = yupper;
x = -x;
// right side
for (int j = -maxRange.height; j <= maxRange.height; ++j, ++ssCount, ++y)
{
ss[ssCount].x = x;
ss[ssCount].y = y;
}
}
}
const cudaStream_t stream = StreamAccessor::getStream(st);
ensureSizeIsEnough(1, ssCount, CV_16SC2, buf);
if (stream == 0)
cudaSafeCall( cudaMemcpy(buf.data, &ss[0], ssCount * sizeof(short2), cudaMemcpyHostToDevice) );
else
cudaSafeCall( cudaMemcpyAsync(buf.data, &ss[0], ssCount * sizeof(short2), cudaMemcpyHostToDevice, stream) );
const int maxX = prev.cols - blockSize.width;
const int maxY = prev.rows - blockSize.height;
const int SMALL_DIFF = 2;
const int BIG_DIFF = 128;
const int blSize = blockSize.area();
const int acceptLevel = blSize * SMALL_DIFF;
const int escapeLevel = blSize * BIG_DIFF;
optflowbm::calc(prev, curr, velx, vely,
make_int2(blockSize.width, blockSize.height), make_int2(shiftSize.width, shiftSize.height), usePrevious,
maxX, maxY, acceptLevel, escapeLevel, buf.ptr<short2>(), ssCount, stream);
}
namespace optflowbm_fast
{
void get_buffer_size(int src_cols, int src_rows, int search_window, int block_window, int& buffer_cols, int& buffer_rows);
template <typename T>
void calc(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream);
}
void cv::gpu::FastOpticalFlowBM::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, int search_window, int block_window, Stream& stream)
{
CV_Assert( I0.type() == CV_8UC1 );
CV_Assert( I1.size() == I0.size() && I1.type() == I0.type() );
int border_size = search_window / 2 + block_window / 2;
Size esize = I0.size() + Size(border_size, border_size) * 2;
ensureSizeIsEnough(esize, I0.type(), extended_I0);
ensureSizeIsEnough(esize, I0.type(), extended_I1);
copyMakeBorder(I0, extended_I0, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), stream);
copyMakeBorder(I1, extended_I1, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), stream);
GpuMat I0_hdr = extended_I0(Rect(Point2i(border_size, border_size), I0.size()));
GpuMat I1_hdr = extended_I1(Rect(Point2i(border_size, border_size), I0.size()));
int bcols, brows;
optflowbm_fast::get_buffer_size(I0.cols, I0.rows, search_window, block_window, bcols, brows);
ensureSizeIsEnough(brows, bcols, CV_32SC1, buffer);
flowx.create(I0.size(), CV_32FC1);
flowy.create(I0.size(), CV_32FC1);
optflowbm_fast::calc<uchar>(I0_hdr, I1_hdr, flowx, flowy, buffer, search_window, block_window, StreamAccessor::getStream(stream));
}
#endif // HAVE_CUDA
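A short usage sketch for the block-matching path above (block, shift and range sizes are illustrative):
#include "opencv2/gpu/gpu.hpp"
// Block-matching flow between two 8-bit grayscale frames; velx/vely are sampled on a grid
// whose resolution is determined by shiftSize, as computed in calcOpticalFlowBM() above.
void blockMatchingFlow(const cv::Mat& prevGray, const cv::Mat& currGray)
{
    cv::gpu::GpuMat d_prev(prevGray), d_curr(currGray);
    cv::gpu::GpuMat velx, vely, buf;
    cv::gpu::calcOpticalFlowBM(d_prev, d_curr,
                               cv::Size(16, 16),        // blockSize
                               cv::Size(1, 1),          // shiftSize
                               cv::Size(16, 16),        // maxRange (radius of the spiral search)
                               false,                   // usePrevious
                               velx, vely, buf,
                               cv::gpu::Stream::Null());
}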

View File

@@ -1,237 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::interpolateFrames(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, float, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::createOpticalFlowNeedleMap(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
#else
namespace
{
size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc, const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
{
NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) );
return gpuCounter.maxSize();
}
}
namespace
{
static void outputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
}
void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s)
{
ncvSetDebugOutputHandler(outputHandler);
CV_Assert(frame0.type() == CV_32FC1);
CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
u.create(frame0.size(), CV_32FC1);
v.create(frame0.size(), CV_32FC1);
cudaDeviceProp devProp;
cudaSafeCall( cudaGetDeviceProperties(&devProp, getDevice()) );
NCVBroxOpticalFlowDescriptor desc;
desc.alpha = alpha;
desc.gamma = gamma;
desc.scale_factor = scale_factor;
desc.number_of_inner_iterations = inner_iterations;
desc.number_of_outer_iterations = outer_iterations;
desc.number_of_solver_iterations = solver_iterations;
NCVMemSegment frame0MemSeg;
frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
frame0MemSeg.size = frame0.step * frame0.rows;
NCVMemSegment frame1MemSeg;
frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
frame1MemSeg.size = frame1.step * frame1.rows;
NCVMemSegment uMemSeg;
uMemSeg.begin.memtype = NCVMemoryTypeDevice;
uMemSeg.begin.ptr = u.ptr();
uMemSeg.size = u.step * u.rows;
NCVMemSegment vMemSeg;
vMemSeg.begin.memtype = NCVMemoryTypeDevice;
vMemSeg.begin.ptr = v.ptr();
vMemSeg.size = v.step * v.rows;
NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));
cudaStream_t stream = StreamAccessor::getStream(s);
size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, devProp);
ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, buf);
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) );
}
void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, const GpuMat& fu, const GpuMat& fv, const GpuMat& bu, const GpuMat& bv,
float pos, GpuMat& newFrame, GpuMat& buf, Stream& s)
{
CV_Assert(frame0.type() == CV_32FC1);
CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
CV_Assert(fu.size() == frame0.size() && fu.type() == frame0.type());
CV_Assert(fv.size() == frame0.size() && fv.type() == frame0.type());
CV_Assert(bu.size() == frame0.size() && bu.type() == frame0.type());
CV_Assert(bv.size() == frame0.size() && bv.type() == frame0.type());
newFrame.create(frame0.size(), frame0.type());
buf.create(6 * frame0.rows, frame0.cols, CV_32FC1);
buf.setTo(Scalar::all(0));
// occlusion masks
GpuMat occ0 = buf.rowRange(0 * frame0.rows, 1 * frame0.rows);
GpuMat occ1 = buf.rowRange(1 * frame0.rows, 2 * frame0.rows);
// interpolated forward flow
GpuMat fui = buf.rowRange(2 * frame0.rows, 3 * frame0.rows);
GpuMat fvi = buf.rowRange(3 * frame0.rows, 4 * frame0.rows);
// interpolated backward flow
GpuMat bui = buf.rowRange(4 * frame0.rows, 5 * frame0.rows);
GpuMat bvi = buf.rowRange(5 * frame0.rows, 6 * frame0.rows);
size_t step = frame0.step;
CV_Assert(frame1.step == step && fu.step == step && fv.step == step && bu.step == step && bv.step == step && newFrame.step == step && buf.step == step);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStStreamHandler h(stream);
NppStInterpolationState state;
state.size = NcvSize32u(frame0.cols, frame0.rows);
state.nStep = static_cast<Ncv32u>(step);
state.pSrcFrame0 = const_cast<Ncv32f*>(frame0.ptr<Ncv32f>());
state.pSrcFrame1 = const_cast<Ncv32f*>(frame1.ptr<Ncv32f>());
state.pFU = const_cast<Ncv32f*>(fu.ptr<Ncv32f>());
state.pFV = const_cast<Ncv32f*>(fv.ptr<Ncv32f>());
state.pBU = const_cast<Ncv32f*>(bu.ptr<Ncv32f>());
state.pBV = const_cast<Ncv32f*>(bv.ptr<Ncv32f>());
state.pos = pos;
state.pNewFrame = newFrame.ptr<Ncv32f>();
state.ppBuffers[0] = occ0.ptr<Ncv32f>();
state.ppBuffers[1] = occ1.ptr<Ncv32f>();
state.ppBuffers[2] = fui.ptr<Ncv32f>();
state.ppBuffers[3] = fvi.ptr<Ncv32f>();
state.ppBuffers[4] = bui.ptr<Ncv32f>();
state.ppBuffers[5] = bvi.ptr<Ncv32f>();
ncvSafeCall( nppiStInterpolateFrames(&state) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
namespace cv { namespace gpu { namespace cudev
{
namespace optical_flow
{
void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg);
void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale);
}
}}}
void cv::gpu::createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMat& vertex, GpuMat& colors)
{
using namespace cv::gpu::cudev::optical_flow;
CV_Assert(u.type() == CV_32FC1);
CV_Assert(v.type() == u.type() && v.size() == u.size());
const int NEEDLE_MAP_SCALE = 16;
const int x_needles = u.cols / NEEDLE_MAP_SCALE;
const int y_needles = u.rows / NEEDLE_MAP_SCALE;
GpuMat u_avg(y_needles, x_needles, CV_32FC1);
GpuMat v_avg(y_needles, x_needles, CV_32FC1);
NeedleMapAverage_gpu(u, v, u_avg, v_avg);
const int NUM_VERTS_PER_ARROW = 6;
const int num_arrows = x_needles * y_needles * NUM_VERTS_PER_ARROW;
vertex.create(1, num_arrows, CV_32FC3);
colors.create(1, num_arrows, CV_32FC3);
colors.setTo(Scalar::all(1.0));
double uMax, vMax;
minMax(u_avg, 0, &uMax);
minMax(v_avg, 0, &vMax);
float max_flow = static_cast<float>(std::sqrt(uMax * uMax + vMax * vMax));
CreateOpticalFlowNeedleMap_gpu(u_avg, v_avg, vertex.ptr<float>(), colors.ptr<float>(), max_flow, 1.0f / u.cols, 1.0f / u.rows);
cvtColor(colors, colors, COLOR_HSV2RGB);
}
#endif /* HAVE_CUDA */
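A brief usage sketch of the Brox flow and needle-map visualization implemented above (the constructor values follow common sample settings and are illustrative):
#include "opencv2/gpu/gpu.hpp"
// Brox optical flow expects single-channel float frames; scaling to [0, 1] is conventional.
void broxFlow(const cv::Mat& frame0Gray8u, const cv::Mat& frame1Gray8u)
{
    cv::Mat f0, f1;
    frame0Gray8u.convertTo(f0, CV_32F, 1.0 / 255.0);
    frame1Gray8u.convertTo(f1, CV_32F, 1.0 / 255.0);
    cv::gpu::GpuMat d_frame0(f0), d_frame1(f1), d_u, d_v;
    // alpha, gamma, scale_factor, inner_iterations, outer_iterations, solver_iterations
    cv::gpu::BroxOpticalFlow brox(0.197f, 50.0f, 0.8f, 10, 77, 10);
    brox(d_frame0, d_frame1, d_u, d_v);
    // Optional: build the arrow (needle map) visualization of the flow field.
    cv::gpu::GpuMat vertices, colors;
    cv::gpu::createOpticalFlowNeedleMap(d_u, d_v, vertices, colors);
}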

View File

@@ -1,409 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#define MIN_SIZE 32
#define S(x) StreamAccessor::getStream(x)
// GPU resize() is fast, but it differs from the CPU analog. Disabling this flag
// leads to inefficient code; it is intended for debugging purposes only.
#define ENABLE_GPU_RESIZE 1
using namespace cv;
using namespace cv::gpu;
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::FarnebackOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
#else
namespace cv { namespace gpu { namespace cudev { namespace optflow_farneback
{
void setPolynomialExpansionConsts(
int polyN, const float *g, const float *xg, const float *xxg,
float ig11, float ig03, float ig33, float ig55);
void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream);
void setUpdateMatricesConsts();
void updateMatricesGpu(
const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
PtrStepSzf M, cudaStream_t stream);
void updateFlowGpu(
const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream);
/*void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);*/
void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);
void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream);
void setGaussianBlurKernel(const float *gKer, int ksizeHalf);
void gaussianBlurGpu(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);
void gaussianBlur5Gpu(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);
void gaussianBlur5Gpu_CC11(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderType, cudaStream_t stream);
}}}} // namespace cv { namespace gpu { namespace cudev { namespace optflow_farneback
void cv::gpu::FarnebackOpticalFlow::prepareGaussian(
int n, double sigma, float *g, float *xg, float *xxg,
double &ig11, double &ig03, double &ig33, double &ig55)
{
double s = 0.;
for (int x = -n; x <= n; x++)
{
g[x] = (float)std::exp(-x*x/(2*sigma*sigma));
s += g[x];
}
s = 1./s;
for (int x = -n; x <= n; x++)
{
g[x] = (float)(g[x]*s);
xg[x] = (float)(x*g[x]);
xxg[x] = (float)(x*x*g[x]);
}
Mat_<double> G(6, 6);
G.setTo(0);
for (int y = -n; y <= n; y++)
{
for (int x = -n; x <= n; x++)
{
G(0,0) += g[y]*g[x];
G(1,1) += g[y]*g[x]*x*x;
G(3,3) += g[y]*g[x]*x*x*x*x;
G(5,5) += g[y]*g[x]*x*x*y*y;
}
}
//G[0][0] = 1.;
G(2,2) = G(0,3) = G(0,4) = G(3,0) = G(4,0) = G(1,1);
G(4,4) = G(3,3);
G(3,4) = G(4,3) = G(5,5);
// invG:
// [ x e e ]
// [ y ]
// [ y ]
// [ e z ]
// [ e z ]
// [ u ]
Mat_<double> invG = G.inv(DECOMP_CHOLESKY);
ig11 = invG(1,1);
ig03 = invG(0,3);
ig33 = invG(3,3);
ig55 = invG(5,5);
}
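// G above is the 6x6 Gram matrix of the Gaussian-weighted polynomial basis {1, x, y, x^2, y^2, xy};
// its inverse has the sparsity pattern sketched in the invG comment, so only the ig11, ig03, ig33
// and ig55 entries need to be passed on to the polynomial expansion kernel.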
void cv::gpu::FarnebackOpticalFlow::setPolynomialExpansionConsts(int n, double sigma)
{
std::vector<float> buf(n*6 + 3);
float* g = &buf[0] + n;
float* xg = g + n*2 + 1;
float* xxg = xg + n*2 + 1;
if (sigma < FLT_EPSILON)
sigma = n*0.3;
double ig11, ig03, ig33, ig55;
prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
cudev::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, static_cast<float>(ig11), static_cast<float>(ig03), static_cast<float>(ig33), static_cast<float>(ig55));
}
void cv::gpu::FarnebackOpticalFlow::updateFlow_boxFilter(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
{
if (deviceSupports(FEATURE_SET_COMPUTE_12))
cudev::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
else
cudev::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, S(streams[0]));
swap(M, bufM);
for (int i = 1; i < 5; ++i)
streams[i].waitForCompletion();
cudev::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
if (updateMatrices)
cudev::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
}
void cv::gpu::FarnebackOpticalFlow::updateFlow_gaussianBlur(
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
{
if (deviceSupports(FEATURE_SET_COMPUTE_12))
cudev::optflow_farneback::gaussianBlur5Gpu(
M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
else
cudev::optflow_farneback::gaussianBlur5Gpu_CC11(
M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
swap(M, bufM);
cudev::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
if (updateMatrices)
cudev::optflow_farneback::updateMatricesGpu(flowx, flowy, R0, R1, M, S(streams[0]));
}
void cv::gpu::FarnebackOpticalFlow::operator ()(
const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s)
{
CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
CV_Assert(frame0.size() == frame1.size());
CV_Assert(polyN == 5 || polyN == 7);
CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6);
Stream streams[5];
if (S(s))
streams[0] = s;
Size size = frame0.size();
GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY;
flowx.create(size, CV_32F);
flowy.create(size, CV_32F);
GpuMat flowx0 = flowx;
GpuMat flowy0 = flowy;
// Crop unnecessary levels
double scale = 1;
int numLevelsCropped = 0;
for (; numLevelsCropped < numLevels; numLevelsCropped++)
{
scale *= pyrScale;
if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
break;
}
streams[0].enqueueConvert(frame0, frames_[0], CV_32F);
streams[1].enqueueConvert(frame1, frames_[1], CV_32F);
if (fastPyramids)
{
// Build Gaussian pyramids using pyrDown()
pyramid0_.resize(numLevelsCropped + 1);
pyramid1_.resize(numLevelsCropped + 1);
pyramid0_[0] = frames_[0];
pyramid1_[0] = frames_[1];
for (int i = 1; i <= numLevelsCropped; ++i)
{
pyrDown(pyramid0_[i - 1], pyramid0_[i], streams[0]);
pyrDown(pyramid1_[i - 1], pyramid1_[i], streams[1]);
}
}
setPolynomialExpansionConsts(polyN, polySigma);
cudev::optflow_farneback::setUpdateMatricesConsts();
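// Coarse-to-fine estimation: walk the pyramid from the coarsest retained level down to
// level 0. At each level the flow from the previous (coarser) level is upscaled and
// rescaled (or taken from flowx0/flowy0 when OPTFLOW_USE_INITIAL_FLOW is set, or zeroed),
// the polynomial expansion is computed for both frames, and the flow is refined with
// numIters update iterations using either a box filter or a Gaussian blur of the matrices M.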
for (int k = numLevelsCropped; k >= 0; k--)
{
streams[0].waitForCompletion();
scale = 1;
for (int i = 0; i < k; i++)
scale *= pyrScale;
double sigma = (1./scale - 1) * 0.5;
int smoothSize = cvRound(sigma*5) | 1;
smoothSize = std::max(smoothSize, 3);
int width = cvRound(size.width*scale);
int height = cvRound(size.height*scale);
if (fastPyramids)
{
width = pyramid0_[k].cols;
height = pyramid0_[k].rows;
}
if (k > 0)
{
curFlowX.create(height, width, CV_32F);
curFlowY.create(height, width, CV_32F);
}
else
{
curFlowX = flowx0;
curFlowY = flowy0;
}
if (!prevFlowX.data)
{
if (flags & OPTFLOW_USE_INITIAL_FLOW)
{
#if ENABLE_GPU_RESIZE
resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), scale);
streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), scale);
#else
Mat tmp1, tmp2;
flowx0.download(tmp1);
resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_AREA);
tmp2 *= scale;
curFlowX.upload(tmp2);
flowy0.download(tmp1);
resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_AREA);
tmp2 *= scale;
curFlowY.upload(tmp2);
#endif
}
else
{
streams[0].enqueueMemSet(curFlowX, 0);
streams[1].enqueueMemSet(curFlowY, 0);
}
}
else
{
#if ENABLE_GPU_RESIZE
resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), 1./pyrScale);
streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), 1./pyrScale);
#else
Mat tmp1, tmp2;
prevFlowX.download(tmp1);
resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR);
tmp2 *= 1./pyrScale;
curFlowX.upload(tmp2);
prevFlowY.download(tmp1);
resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR);
tmp2 *= 1./pyrScale;
curFlowY.upload(tmp2);
#endif
}
GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
GpuMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
GpuMat R[2] =
{
allocMatFromBuf(5*height, width, CV_32F, R_[0]),
allocMatFromBuf(5*height, width, CV_32F, R_[1])
};
if (fastPyramids)
{
cudev::optflow_farneback::polynomialExpansionGpu(pyramid0_[k], polyN, R[0], S(streams[0]));
cudev::optflow_farneback::polynomialExpansionGpu(pyramid1_[k], polyN, R[1], S(streams[1]));
}
else
{
GpuMat blurredFrame[2] =
{
allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
};
GpuMat pyrLevel[2] =
{
allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
};
Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
cudev::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);
for (int i = 0; i < 2; i++)
{
cudev::optflow_farneback::gaussianBlurGpu(
frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101_GPU, S(streams[i]));
#if ENABLE_GPU_RESIZE
resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0, 0, INTER_LINEAR, streams[i]);
#else
Mat tmp1, tmp2;
blurredFrame[i].download(tmp1);
resize(tmp1, tmp2, Size(width, height), 0, 0, INTER_LINEAR);
pyrLevel[i].upload(tmp2);
#endif
cudev::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN, R[i], S(streams[i]));
}
}
streams[1].waitForCompletion();
cudev::optflow_farneback::updateMatricesGpu(curFlowX, curFlowY, R[0], R[1], M, S(streams[0]));
if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
{
Mat g = getGaussianKernel(winSize, winSize/2*0.3f, CV_32F);
cudev::optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize/2), winSize/2);
}
for (int i = 0; i < numIters; i++)
{
if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
else
updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1, streams);
}
prevFlowX = curFlowX;
prevFlowY = curFlowY;
}
flowx = curFlowX;
flowy = curFlowY;
if (!S(s))
streams[0].waitForCompletion();
}
#endif

View File

@@ -1,250 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::gpu::PyrLKOpticalFlow::PyrLKOpticalFlow() { throw_no_cuda(); }
void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_no_cuda(); }
void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_no_cuda(); }
void cv::gpu::PyrLKOpticalFlow::releaseMemory() {}
#else /* !defined (HAVE_CUDA) */
namespace pyrlk
{
void loadConstants(int2 winSize, int iters);
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
}
cv::gpu::PyrLKOpticalFlow::PyrLKOpticalFlow()
{
winSize = Size(21, 21);
maxLevel = 3;
iters = 30;
useInitialFlow = false;
}
namespace
{
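// Choose CUDA block dimensions for the sparse LK kernel based on the tracking window
// shape and the device's compute capability, and report how many block-sized tiles
// (patch.x x patch.y) are needed to cover the window.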
void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch)
{
if (winSize.width > 32 && winSize.width > 2 * winSize.height)
{
block.x = deviceSupports(FEATURE_SET_COMPUTE_12) ? 32 : 16;
block.y = 8;
}
else
{
block.x = 16;
block.y = deviceSupports(FEATURE_SET_COMPUTE_12) ? 16 : 8;
}
patch.x = (winSize.width + block.x - 1) / block.x;
patch.y = (winSize.height + block.y - 1) / block.y;
block.z = patch.z = 1;
}
}
void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err)
{
if (prevPts.empty())
{
nextPts.release();
status.release();
if (err) err->release();
return;
}
dim3 block, patch;
calcPatchSize(winSize, block, patch);
CV_Assert(prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4);
CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
CV_Assert(maxLevel >= 0);
CV_Assert(winSize.width > 2 && winSize.height > 2);
CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);
if (useInitialFlow)
CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2);
else
ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
GpuMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
GpuMat temp2 = nextPts.reshape(1);
multiply(temp1, Scalar::all(1.0 / (1 << maxLevel) / 2.0), temp2);
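// The initial guesses (prevPts, or the caller-supplied nextPts when useInitialFlow is set)
// are rescaled here into the coordinate frame of the coarsest pyramid level; the extra
// factor of 0.5 presumably offsets the doubling the sparse kernel applies as it starts
// each level.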
ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
status.setTo(Scalar::all(1));
if (err)
ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
// build the image pyramids.
prevPyr_.resize(maxLevel + 1);
nextPyr_.resize(maxLevel + 1);
int cn = prevImg.channels();
if (cn == 1 || cn == 4)
{
prevImg.convertTo(prevPyr_[0], CV_32F);
nextImg.convertTo(nextPyr_[0], CV_32F);
}
else
{
cvtColor(prevImg, buf_, COLOR_BGR2BGRA);
buf_.convertTo(prevPyr_[0], CV_32F);
cvtColor(nextImg, buf_, COLOR_BGR2BGRA);
buf_.convertTo(nextPyr_[0], CV_32F);
}
for (int level = 1; level <= maxLevel; ++level)
{
pyrDown(prevPyr_[level - 1], prevPyr_[level]);
pyrDown(nextPyr_[level - 1], nextPyr_[level]);
}
pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters);
for (int level = maxLevel; level >= 0; level--)
{
if (cn == 1)
{
pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
level, block, patch);
}
else
{
pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
level, block, patch);
}
}
}
void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err)
{
CV_Assert(prevImg.type() == CV_8UC1);
CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
CV_Assert(maxLevel >= 0);
CV_Assert(winSize.width > 2 && winSize.height > 2);
if (err)
err->create(prevImg.size(), CV_32FC1);
// build the image pyramids.
prevPyr_.resize(maxLevel + 1);
nextPyr_.resize(maxLevel + 1);
prevPyr_[0] = prevImg;
nextImg.convertTo(nextPyr_[0], CV_32F);
for (int level = 1; level <= maxLevel; ++level)
{
pyrDown(prevPyr_[level - 1], prevPyr_[level]);
pyrDown(nextPyr_[level - 1], nextPyr_[level]);
}
ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);
ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
uPyr_[0].setTo(Scalar::all(0));
vPyr_[0].setTo(Scalar::all(0));
uPyr_[1].setTo(Scalar::all(0));
vPyr_[1].setTo(Scalar::all(0));
int2 winSize2i = make_int2(winSize.width, winSize.height);
pyrlk::loadConstants(winSize2i, iters);
PtrStepSzf derr = err ? *err : PtrStepSzf();
int idx = 0;
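// uPyr_/vPyr_ act as ping-pong buffers: at each level the kernel writes the refined flow
// into uPyr_[idx]/vPyr_[idx] while reading the previous (coarser-level) estimate from
// uPyr_[idx2]/vPyr_[idx2]; the roles are swapped before descending to the next level.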
for (int level = maxLevel; level >= 0; level--)
{
int idx2 = (idx + 1) & 1;
pyrlk::dense(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
level == 0 ? derr : PtrStepSzf(), winSize2i);
if (level > 0)
idx = idx2;
}
uPyr_[idx].copyTo(u);
vPyr_[idx].copyTo(v);
}
void cv::gpu::PyrLKOpticalFlow::releaseMemory()
{
prevPyr_.clear();
nextPyr_.clear();
buf_.release();
uPyr_[0].release();
vPyr_[0].release();
uPyr_[1].release();
vPyr_[1].release();
}
#endif /* !defined (HAVE_CUDA) */

View File

@@ -1,258 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
cv::gpu::OpticalFlowDual_TVL1_GPU::OpticalFlowDual_TVL1_GPU() { throw_no_cuda(); }
void cv::gpu::OpticalFlowDual_TVL1_GPU::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::gpu::OpticalFlowDual_TVL1_GPU::collectGarbage() {}
void cv::gpu::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
#else
using namespace cv;
using namespace cv::gpu;
cv::gpu::OpticalFlowDual_TVL1_GPU::OpticalFlowDual_TVL1_GPU()
{
tau = 0.25;
lambda = 0.15;
theta = 0.3;
nscales = 5;
warps = 5;
epsilon = 0.01;
iterations = 300;
scaleStep = 0.8;
useInitialFlow = false;
}
void cv::gpu::OpticalFlowDual_TVL1_GPU::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy)
{
CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
CV_Assert( I0.size() == I1.size() );
CV_Assert( I0.type() == I1.type() );
CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
CV_Assert( nscales > 0 );
// allocate memory for the pyramid structure
I0s.resize(nscales);
I1s.resize(nscales);
u1s.resize(nscales);
u2s.resize(nscales);
I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0);
I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);
if (!useInitialFlow)
{
flowx.create(I0.size(), CV_32FC1);
flowy.create(I0.size(), CV_32FC1);
}
u1s[0] = flowx;
u2s[0] = flowy;
I1x_buf.create(I0.size(), CV_32FC1);
I1y_buf.create(I0.size(), CV_32FC1);
I1w_buf.create(I0.size(), CV_32FC1);
I1wx_buf.create(I0.size(), CV_32FC1);
I1wy_buf.create(I0.size(), CV_32FC1);
grad_buf.create(I0.size(), CV_32FC1);
rho_c_buf.create(I0.size(), CV_32FC1);
p11_buf.create(I0.size(), CV_32FC1);
p12_buf.create(I0.size(), CV_32FC1);
p21_buf.create(I0.size(), CV_32FC1);
p22_buf.create(I0.size(), CV_32FC1);
diff_buf.create(I0.size(), CV_32FC1);
// create the scales
for (int s = 1; s < nscales; ++s)
{
gpu::resize(I0s[s-1], I0s[s], Size(), scaleStep, scaleStep);
gpu::resize(I1s[s-1], I1s[s], Size(), scaleStep, scaleStep);
if (I0s[s].cols < 16 || I0s[s].rows < 16)
{
nscales = s;
break;
}
if (useInitialFlow)
{
gpu::resize(u1s[s-1], u1s[s], Size(), scaleStep, scaleStep);
gpu::resize(u2s[s-1], u2s[s], Size(), scaleStep, scaleStep);
gpu::multiply(u1s[s], Scalar::all(scaleStep), u1s[s]);
gpu::multiply(u2s[s], Scalar::all(scaleStep), u2s[s]);
}
else
{
u1s[s].create(I0s[s].size(), CV_32FC1);
u2s[s].create(I0s[s].size(), CV_32FC1);
}
}
if (!useInitialFlow)
{
u1s[nscales-1].setTo(Scalar::all(0));
u2s[nscales-1].setTo(Scalar::all(0));
}
// pyramidal structure for computing the optical flow
for (int s = nscales - 1; s >= 0; --s)
{
// compute the optical flow at the current scale
procOneScale(I0s[s], I1s[s], u1s[s], u2s[s]);
// if this was the last scale, finish now
if (s == 0)
break;
// otherwise, upsample the optical flow
// zoom the optical flow for the next finer scale
gpu::resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
gpu::resize(u2s[s], u2s[s - 1], I0s[s - 1].size());
// scale the optical flow with the appropriate zoom factor
gpu::multiply(u1s[s - 1], Scalar::all(1/scaleStep), u1s[s - 1]);
gpu::multiply(u2s[s - 1], Scalar::all(1/scaleStep), u2s[s - 1]);
}
}
namespace tvl1flow
{
void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy);
void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho);
void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
PtrStepSzf grad, PtrStepSzf rho_c,
PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22,
PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error,
float l_t, float theta);
void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, float taut);
}
void cv::gpu::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2)
{
using namespace tvl1flow;
const double scaledEpsilon = epsilon * epsilon * I0.size().area();
CV_DbgAssert( I1.size() == I0.size() );
CV_DbgAssert( I1.type() == I0.type() );
CV_DbgAssert( u1.size() == I0.size() );
CV_DbgAssert( u2.size() == u1.size() );
GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
centeredGradient(I1, I1x, I1y);
GpuMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
p11.setTo(Scalar::all(0));
p12.setTo(Scalar::all(0));
p21.setTo(Scalar::all(0));
p22.setTo(Scalar::all(0));
GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
const float l_t = static_cast<float>(lambda * theta);
const float taut = static_cast<float>(tau / theta);
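// For each warp, I1 and its gradients are warped backwards by the current flow estimate,
// then the primal variables (u1, u2) and the dual variables (p11..p22) are updated in
// alternation until the summed per-pixel update reported in 'diff' drops below
// epsilon^2 * area or the iteration limit is reached.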
for (int warpings = 0; warpings < warps; ++warpings)
{
warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);
double error = std::numeric_limits<double>::max();
for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
{
estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, diff, l_t, static_cast<float>(theta));
error = gpu::sum(diff, norm_buf)[0];
estimateDualVariables(u1, u2, p11, p12, p21, p22, taut);
}
}
}
void cv::gpu::OpticalFlowDual_TVL1_GPU::collectGarbage()
{
I0s.clear();
I1s.clear();
u1s.clear();
u2s.clear();
I1x_buf.release();
I1y_buf.release();
I1w_buf.release();
I1wx_buf.release();
I1wy_buf.release();
grad_buf.release();
rho_c_buf.release();
p11_buf.release();
p12_buf.release();
p21_buf.release();
p22_buf.release();
diff_buf.release();
norm_buf.release();
}
#endif // !defined HAVE_CUDA || defined(CUDA_DISABLER)

View File

@@ -1,405 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#include "opencv2/legacy.hpp"
#ifdef HAVE_CUDA
using namespace cvtest;
#if defined(HAVE_XINE) || \
defined(HAVE_GSTREAMER) || \
defined(HAVE_QUICKTIME) || \
defined(HAVE_AVFOUNDATION) || \
defined(HAVE_FFMPEG) || \
defined(WIN32) /* assume that we have ffmpeg */
# define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
#else
# define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
#endif
//////////////////////////////////////////////////////
// FGDStatModel
#if BUILD_WITH_VIDEO_INPUT_SUPPORT
namespace cv
{
template<> void Ptr<CvBGStatModel>::delete_obj()
{
cvReleaseBGStatModel(&obj);
}
}
PARAM_TEST_CASE(FGDStatModel, cv::gpu::DeviceInfo, std::string, Channels)
{
cv::gpu::DeviceInfo devInfo;
std::string inputFile;
int out_cn;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "video/" + GET_PARAM(1);
out_cn = GET_PARAM(2);
}
};
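// Runs the GPU FGD statistical model and the legacy CPU cvCreateFGDStatModel /
// cvUpdateBGStatModel implementation on the same frames and compares the reported
// foreground masks and background images frame by frame.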
GPU_TEST_P(FGDStatModel, Update)
{
cv::VideoCapture cap(inputFile);
ASSERT_TRUE(cap.isOpened());
cv::Mat frame;
cap >> frame;
ASSERT_FALSE(frame.empty());
IplImage ipl_frame = frame;
cv::Ptr<CvBGStatModel> model(cvCreateFGDStatModel(&ipl_frame));
cv::gpu::GpuMat d_frame(frame);
cv::gpu::FGDStatModel d_model(out_cn);
d_model.create(d_frame);
cv::Mat h_background;
cv::Mat h_foreground;
cv::Mat h_background3;
cv::Mat backgroundDiff;
cv::Mat foregroundDiff;
for (int i = 0; i < 5; ++i)
{
cap >> frame;
ASSERT_FALSE(frame.empty());
ipl_frame = frame;
int gold_count = cvUpdateBGStatModel(&ipl_frame, model);
d_frame.upload(frame);
int count = d_model.update(d_frame);
ASSERT_EQ(gold_count, count);
cv::Mat gold_background = cv::cvarrToMat(model->background);
cv::Mat gold_foreground = cv::cvarrToMat(model->foreground);
if (out_cn == 3)
d_model.background.download(h_background3);
else
{
d_model.background.download(h_background);
cv::cvtColor(h_background, h_background3, cv::COLOR_BGRA2BGR);
}
d_model.foreground.download(h_foreground);
ASSERT_MAT_NEAR(gold_background, h_background3, 1.0);
ASSERT_MAT_NEAR(gold_foreground, h_foreground, 0.0);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Video, FGDStatModel, testing::Combine(
ALL_DEVICES,
testing::Values(std::string("768x576.avi")),
testing::Values(Channels(3), Channels(4))));
#endif
//////////////////////////////////////////////////////
// MOG
#if BUILD_WITH_VIDEO_INPUT_SUPPORT
namespace
{
IMPLEMENT_PARAM_CLASS(UseGray, bool)
IMPLEMENT_PARAM_CLASS(LearningRate, double)
}
PARAM_TEST_CASE(MOG, cv::gpu::DeviceInfo, std::string, UseGray, LearningRate, UseRoi)
{
cv::gpu::DeviceInfo devInfo;
std::string inputFile;
bool useGray;
double learningRate;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "video/" + GET_PARAM(1);
useGray = GET_PARAM(2);
learningRate = GET_PARAM(3);
useRoi = GET_PARAM(4);
}
};
GPU_TEST_P(MOG, Update)
{
cv::VideoCapture cap(inputFile);
ASSERT_TRUE(cap.isOpened());
cv::Mat frame;
cap >> frame;
ASSERT_FALSE(frame.empty());
cv::gpu::MOG_GPU mog;
cv::gpu::GpuMat foreground = createMat(frame.size(), CV_8UC1, useRoi);
cv::Ptr<cv::BackgroundSubtractorMOG> mog_gold = cv::createBackgroundSubtractorMOG();
cv::Mat foreground_gold;
for (int i = 0; i < 10; ++i)
{
cap >> frame;
ASSERT_FALSE(frame.empty());
if (useGray)
{
cv::Mat temp;
cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
cv::swap(temp, frame);
}
mog(loadMat(frame, useRoi), foreground, (float)learningRate);
mog_gold->apply(frame, foreground_gold, learningRate);
ASSERT_MAT_NEAR(foreground_gold, foreground, 0.0);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Video, MOG, testing::Combine(
ALL_DEVICES,
testing::Values(std::string("768x576.avi")),
testing::Values(UseGray(true), UseGray(false)),
testing::Values(LearningRate(0.0), LearningRate(0.01)),
WHOLE_SUBMAT));
#endif
//////////////////////////////////////////////////////
// MOG2
#if BUILD_WITH_VIDEO_INPUT_SUPPORT
namespace
{
IMPLEMENT_PARAM_CLASS(DetectShadow, bool)
}
PARAM_TEST_CASE(MOG2, cv::gpu::DeviceInfo, std::string, UseGray, DetectShadow, UseRoi)
{
cv::gpu::DeviceInfo devInfo;
std::string inputFile;
bool useGray;
bool detectShadow;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "video/" + GET_PARAM(1);
useGray = GET_PARAM(2);
detectShadow = GET_PARAM(3);
useRoi = GET_PARAM(4);
}
};
GPU_TEST_P(MOG2, Update)
{
cv::VideoCapture cap(inputFile);
ASSERT_TRUE(cap.isOpened());
cv::Mat frame;
cap >> frame;
ASSERT_FALSE(frame.empty());
cv::gpu::MOG2_GPU mog2;
mog2.bShadowDetection = detectShadow;
cv::gpu::GpuMat foreground = createMat(frame.size(), CV_8UC1, useRoi);
cv::Ptr<cv::BackgroundSubtractorMOG2> mog2_gold = cv::createBackgroundSubtractorMOG2();
mog2_gold->setDetectShadows(detectShadow);
cv::Mat foreground_gold;
for (int i = 0; i < 10; ++i)
{
cap >> frame;
ASSERT_FALSE(frame.empty());
if (useGray)
{
cv::Mat temp;
cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
cv::swap(temp, frame);
}
mog2(loadMat(frame, useRoi), foreground);
mog2_gold->apply(frame, foreground_gold);
if (detectShadow)
{
ASSERT_MAT_SIMILAR(foreground_gold, foreground, 1e-2);
}
else
{
ASSERT_MAT_NEAR(foreground_gold, foreground, 0);
}
}
}
GPU_TEST_P(MOG2, getBackgroundImage)
{
if (useGray)
return;
cv::VideoCapture cap(inputFile);
ASSERT_TRUE(cap.isOpened());
cv::Mat frame;
cv::gpu::MOG2_GPU mog2;
mog2.bShadowDetection = detectShadow;
cv::gpu::GpuMat foreground;
cv::Ptr<cv::BackgroundSubtractorMOG2> mog2_gold = cv::createBackgroundSubtractorMOG2();
mog2_gold->setDetectShadows(detectShadow);
cv::Mat foreground_gold;
for (int i = 0; i < 10; ++i)
{
cap >> frame;
ASSERT_FALSE(frame.empty());
mog2(loadMat(frame, useRoi), foreground);
mog2_gold->apply(frame, foreground_gold);
}
cv::gpu::GpuMat background = createMat(frame.size(), frame.type(), useRoi);
mog2.getBackgroundImage(background);
cv::Mat background_gold;
mog2_gold->getBackgroundImage(background_gold);
ASSERT_MAT_NEAR(background_gold, background, 0);
}
INSTANTIATE_TEST_CASE_P(GPU_Video, MOG2, testing::Combine(
ALL_DEVICES,
testing::Values(std::string("768x576.avi")),
testing::Values(UseGray(true), UseGray(false)),
testing::Values(DetectShadow(true), DetectShadow(false)),
WHOLE_SUBMAT));
#endif
//////////////////////////////////////////////////////
// GMG
PARAM_TEST_CASE(GMG, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels, UseRoi)
{
};
GPU_TEST_P(GMG, Accuracy)
{
const cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
const cv::Size size = GET_PARAM(1);
const int depth = GET_PARAM(2);
const int channels = GET_PARAM(3);
const bool useRoi = GET_PARAM(4);
const int type = CV_MAKE_TYPE(depth, channels);
const cv::Mat zeros(size, CV_8UC1, cv::Scalar::all(0));
const cv::Mat fullfg(size, CV_8UC1, cv::Scalar::all(255));
cv::Mat frame = randomMat(size, type, 0, 100);
cv::gpu::GpuMat d_frame = loadMat(frame, useRoi);
cv::gpu::GMG_GPU gmg;
gmg.numInitializationFrames = 5;
gmg.smoothingRadius = 0;
gmg.initialize(d_frame.size(), 0, 255);
cv::gpu::GpuMat d_fgmask = createMat(size, CV_8UC1, useRoi);
for (int i = 0; i < gmg.numInitializationFrames; ++i)
{
gmg(d_frame, d_fgmask);
// fgmask should be entirely background during training
ASSERT_MAT_NEAR(zeros, d_fgmask, 0);
}
frame = randomMat(size, type, 160, 255);
d_frame = loadMat(frame, useRoi);
gmg(d_frame, d_fgmask);
// now fgmask should be entirely foreground
ASSERT_MAT_NEAR(fullfg, d_fgmask, 0);
}
INSTANTIATE_TEST_CASE_P(GPU_Video, GMG, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8U), MatType(CV_16U), MatType(CV_32F)),
testing::Values(Channels(1), Channels(3), Channels(4)),
WHOLE_SUBMAT));
#endif // HAVE_CUDA

View File

@@ -1,630 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#include "opencv2/legacy.hpp"
#ifdef HAVE_CUDA
using namespace cvtest;
//////////////////////////////////////////////////////
// BroxOpticalFlow
//#define BROX_DUMP
struct BroxOpticalFlow : testing::TestWithParam<cv::gpu::DeviceInfo>
{
cv::gpu::DeviceInfo devInfo;
virtual void SetUp()
{
devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(BroxOpticalFlow, Regression)
{
cv::Mat frame0 = readImageType("opticalflow/frame0.png", CV_32FC1);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImageType("opticalflow/frame1.png", CV_32FC1);
ASSERT_FALSE(frame1.empty());
cv::gpu::BroxOpticalFlow brox(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/,
10 /*inner_iterations*/, 77 /*outer_iterations*/, 10 /*solver_iterations*/);
cv::gpu::GpuMat u;
cv::gpu::GpuMat v;
brox(loadMat(frame0), loadMat(frame1), u, v);
std::string fname(cvtest::TS::ptr()->get_data_path());
if (devInfo.majorVersion() >= 2)
fname += "opticalflow/brox_optical_flow_cc20.bin";
else
fname += "opticalflow/brox_optical_flow.bin";
#ifndef BROX_DUMP
std::ifstream f(fname.c_str(), std::ios_base::binary);
int rows, cols;
f.read((char*) &rows, sizeof(rows));
f.read((char*) &cols, sizeof(cols));
cv::Mat u_gold(rows, cols, CV_32FC1);
for (int i = 0; i < u_gold.rows; ++i)
f.read(u_gold.ptr<char>(i), u_gold.cols * sizeof(float));
cv::Mat v_gold(rows, cols, CV_32FC1);
for (int i = 0; i < v_gold.rows; ++i)
f.read(v_gold.ptr<char>(i), v_gold.cols * sizeof(float));
EXPECT_MAT_NEAR(u_gold, u, 0);
EXPECT_MAT_NEAR(v_gold, v, 0);
#else
std::ofstream f(fname.c_str(), std::ios_base::binary);
f.write((char*) &u.rows, sizeof(u.rows));
f.write((char*) &u.cols, sizeof(u.cols));
cv::Mat h_u(u);
cv::Mat h_v(v);
for (int i = 0; i < u.rows; ++i)
f.write(h_u.ptr<char>(i), u.cols * sizeof(float));
for (int i = 0; i < v.rows; ++i)
f.write(h_v.ptr<char>(i), v.cols * sizeof(float));
#endif
}
GPU_TEST_P(BroxOpticalFlow, OpticalFlowNan)
{
cv::Mat frame0 = readImageType("opticalflow/frame0.png", CV_32FC1);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImageType("opticalflow/frame1.png", CV_32FC1);
ASSERT_FALSE(frame1.empty());
cv::Mat r_frame0, r_frame1;
cv::resize(frame0, r_frame0, cv::Size(1380,1000));
cv::resize(frame1, r_frame1, cv::Size(1380,1000));
cv::gpu::BroxOpticalFlow brox(0.197f /*alpha*/, 50.0f /*gamma*/, 0.8f /*scale_factor*/,
5 /*inner_iterations*/, 150 /*outer_iterations*/, 10 /*solver_iterations*/);
cv::gpu::GpuMat u;
cv::gpu::GpuMat v;
brox(loadMat(r_frame0), loadMat(r_frame1), u, v);
cv::Mat h_u, h_v;
u.download(h_u);
v.download(h_v);
EXPECT_TRUE(cv::checkRange(h_u));
EXPECT_TRUE(cv::checkRange(h_v));
};
INSTANTIATE_TEST_CASE_P(GPU_Video, BroxOpticalFlow, ALL_DEVICES);
//////////////////////////////////////////////////////
// GoodFeaturesToTrack
namespace
{
IMPLEMENT_PARAM_CLASS(MinDistance, double)
}
PARAM_TEST_CASE(GoodFeaturesToTrack, cv::gpu::DeviceInfo, MinDistance)
{
cv::gpu::DeviceInfo devInfo;
double minDistance;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
minDistance = GET_PARAM(1);
cv::gpu::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(GoodFeaturesToTrack, Accuracy)
{
cv::Mat image = readImage("opticalflow/frame0.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty());
int maxCorners = 1000;
double qualityLevel = 0.01;
cv::gpu::GoodFeaturesToTrackDetector_GPU detector(maxCorners, qualityLevel, minDistance);
cv::gpu::GpuMat d_pts;
detector(loadMat(image), d_pts);
ASSERT_FALSE(d_pts.empty());
std::vector<cv::Point2f> pts(d_pts.cols);
cv::Mat pts_mat(1, d_pts.cols, CV_32FC2, (void*) &pts[0]);
d_pts.download(pts_mat);
std::vector<cv::Point2f> pts_gold;
cv::goodFeaturesToTrack(image, pts_gold, maxCorners, qualityLevel, minDistance);
ASSERT_EQ(pts_gold.size(), pts.size());
size_t mismatch = 0;
for (size_t i = 0; i < pts.size(); ++i)
{
cv::Point2i a = pts_gold[i];
cv::Point2i b = pts[i];
bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1;
if (!eq)
++mismatch;
}
double bad_ratio = static_cast<double>(mismatch) / pts.size();
ASSERT_LE(bad_ratio, 0.01);
}
GPU_TEST_P(GoodFeaturesToTrack, EmptyCorners)
{
int maxCorners = 1000;
double qualityLevel = 0.01;
cv::gpu::GoodFeaturesToTrackDetector_GPU detector(maxCorners, qualityLevel, minDistance);
cv::gpu::GpuMat src(100, 100, CV_8UC1, cv::Scalar::all(0));
cv::gpu::GpuMat corners(1, maxCorners, CV_32FC2);
detector(src, corners);
ASSERT_TRUE(corners.empty());
}
INSTANTIATE_TEST_CASE_P(GPU_Video, GoodFeaturesToTrack, testing::Combine(
ALL_DEVICES,
testing::Values(MinDistance(0.0), MinDistance(3.0))));
//////////////////////////////////////////////////////
// PyrLKOpticalFlow
namespace
{
IMPLEMENT_PARAM_CLASS(UseGray, bool)
}
PARAM_TEST_CASE(PyrLKOpticalFlow, cv::gpu::DeviceInfo, UseGray)
{
cv::gpu::DeviceInfo devInfo;
bool useGray;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
useGray = GET_PARAM(1);
cv::gpu::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(PyrLKOpticalFlow, Sparse)
{
cv::Mat frame0 = readImage("opticalflow/frame0.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("opticalflow/frame1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame1.empty());
cv::Mat gray_frame;
if (useGray)
gray_frame = frame0;
else
cv::cvtColor(frame0, gray_frame, cv::COLOR_BGR2GRAY);
std::vector<cv::Point2f> pts;
cv::goodFeaturesToTrack(gray_frame, pts, 1000, 0.01, 0.0);
cv::gpu::GpuMat d_pts;
cv::Mat pts_mat(1, (int) pts.size(), CV_32FC2, (void*) &pts[0]);
d_pts.upload(pts_mat);
cv::gpu::PyrLKOpticalFlow pyrLK;
cv::gpu::GpuMat d_nextPts;
cv::gpu::GpuMat d_status;
pyrLK.sparse(loadMat(frame0), loadMat(frame1), d_pts, d_nextPts, d_status);
std::vector<cv::Point2f> nextPts(d_nextPts.cols);
cv::Mat nextPts_mat(1, d_nextPts.cols, CV_32FC2, (void*) &nextPts[0]);
d_nextPts.download(nextPts_mat);
std::vector<unsigned char> status(d_status.cols);
cv::Mat status_mat(1, d_status.cols, CV_8UC1, (void*) &status[0]);
d_status.download(status_mat);
std::vector<cv::Point2f> nextPts_gold;
std::vector<unsigned char> status_gold;
cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts_gold, status_gold, cv::noArray());
ASSERT_EQ(nextPts_gold.size(), nextPts.size());
ASSERT_EQ(status_gold.size(), status.size());
size_t mismatch = 0;
for (size_t i = 0; i < nextPts.size(); ++i)
{
cv::Point2i a = nextPts[i];
cv::Point2i b = nextPts_gold[i];
if (status[i] != status_gold[i])
{
++mismatch;
continue;
}
if (status[i])
{
bool eq = std::abs(a.x - b.x) <= 1 && std::abs(a.y - b.y) <= 1;
if (!eq)
++mismatch;
}
}
double bad_ratio = static_cast<double>(mismatch) / nextPts.size();
ASSERT_LE(bad_ratio, 0.01);
}
INSTANTIATE_TEST_CASE_P(GPU_Video, PyrLKOpticalFlow, testing::Combine(
ALL_DEVICES,
testing::Values(UseGray(true), UseGray(false))));
//////////////////////////////////////////////////////
// FarnebackOpticalFlow
namespace
{
IMPLEMENT_PARAM_CLASS(PyrScale, double)
IMPLEMENT_PARAM_CLASS(PolyN, int)
CV_FLAGS(FarnebackOptFlowFlags, 0, OPTFLOW_FARNEBACK_GAUSSIAN)
IMPLEMENT_PARAM_CLASS(UseInitFlow, bool)
}
PARAM_TEST_CASE(FarnebackOpticalFlow, cv::gpu::DeviceInfo, PyrScale, PolyN, FarnebackOptFlowFlags, UseInitFlow)
{
cv::gpu::DeviceInfo devInfo;
double pyrScale;
int polyN;
int flags;
bool useInitFlow;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
pyrScale = GET_PARAM(1);
polyN = GET_PARAM(2);
flags = GET_PARAM(3);
useInitFlow = GET_PARAM(4);
cv::gpu::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(FarnebackOpticalFlow, Accuracy)
{
cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame1.empty());
double polySigma = polyN <= 5 ? 1.1 : 1.5;
cv::gpu::FarnebackOpticalFlow farn;
farn.pyrScale = pyrScale;
farn.polyN = polyN;
farn.polySigma = polySigma;
farn.flags = flags;
cv::gpu::GpuMat d_flowx, d_flowy;
farn(loadMat(frame0), loadMat(frame1), d_flowx, d_flowy);
cv::Mat flow;
if (useInitFlow)
{
cv::Mat flowxy[] = {cv::Mat(d_flowx), cv::Mat(d_flowy)};
cv::merge(flowxy, 2, flow);
farn.flags |= cv::OPTFLOW_USE_INITIAL_FLOW;
farn(loadMat(frame0), loadMat(frame1), d_flowx, d_flowy);
}
cv::calcOpticalFlowFarneback(
frame0, frame1, flow, farn.pyrScale, farn.numLevels, farn.winSize,
farn.numIters, farn.polyN, farn.polySigma, farn.flags);
std::vector<cv::Mat> flowxy;
cv::split(flow, flowxy);
EXPECT_MAT_SIMILAR(flowxy[0], d_flowx, 0.1);
EXPECT_MAT_SIMILAR(flowxy[1], d_flowy, 0.1);
}
INSTANTIATE_TEST_CASE_P(GPU_Video, FarnebackOpticalFlow, testing::Combine(
ALL_DEVICES,
testing::Values(PyrScale(0.3), PyrScale(0.5), PyrScale(0.8)),
testing::Values(PolyN(5), PolyN(7)),
testing::Values(FarnebackOptFlowFlags(0), FarnebackOptFlowFlags(cv::OPTFLOW_FARNEBACK_GAUSSIAN)),
testing::Values(UseInitFlow(false), UseInitFlow(true))));
//////////////////////////////////////////////////////
// OpticalFlowDual_TVL1
PARAM_TEST_CASE(OpticalFlowDual_TVL1, cv::gpu::DeviceInfo, UseRoi)
{
cv::gpu::DeviceInfo devInfo;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
useRoi = GET_PARAM(1);
cv::gpu::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(OpticalFlowDual_TVL1, Accuracy)
{
cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame1.empty());
cv::gpu::OpticalFlowDual_TVL1_GPU d_alg;
cv::gpu::GpuMat d_flowx = createMat(frame0.size(), CV_32FC1, useRoi);
cv::gpu::GpuMat d_flowy = createMat(frame0.size(), CV_32FC1, useRoi);
d_alg(loadMat(frame0, useRoi), loadMat(frame1, useRoi), d_flowx, d_flowy);
cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
alg->set("medianFiltering", 1);
alg->set("innerIterations", 1);
alg->set("outerIterations", d_alg.iterations);
cv::Mat flow;
alg->calc(frame0, frame1, flow);
cv::Mat gold[2];
cv::split(flow, gold);
EXPECT_MAT_SIMILAR(gold[0], d_flowx, 4e-3);
EXPECT_MAT_SIMILAR(gold[1], d_flowy, 4e-3);
}
INSTANTIATE_TEST_CASE_P(GPU_Video, OpticalFlowDual_TVL1, testing::Combine(
ALL_DEVICES,
WHOLE_SUBMAT));
//////////////////////////////////////////////////////
// OpticalFlowBM
namespace
{
void calcOpticalFlowBM(const cv::Mat& prev, const cv::Mat& curr,
cv::Size bSize, cv::Size shiftSize, cv::Size maxRange, int usePrevious,
cv::Mat& velx, cv::Mat& vely)
{
cv::Size sz((curr.cols - bSize.width + shiftSize.width)/shiftSize.width, (curr.rows - bSize.height + shiftSize.height)/shiftSize.height);
velx.create(sz, CV_32FC1);
vely.create(sz, CV_32FC1);
CvMat cvprev = prev;
CvMat cvcurr = curr;
CvMat cvvelx = velx;
CvMat cvvely = vely;
cvCalcOpticalFlowBM(&cvprev, &cvcurr, bSize, shiftSize, maxRange, usePrevious, &cvvelx, &cvvely);
}
}
struct OpticalFlowBM : testing::TestWithParam<cv::gpu::DeviceInfo>
{
};
GPU_TEST_P(OpticalFlowBM, Accuracy)
{
cv::gpu::DeviceInfo devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame1.empty());
cv::Size block_size(16, 16);
cv::Size shift_size(1, 1);
cv::Size max_range(16, 16);
cv::gpu::GpuMat d_velx, d_vely, buf;
cv::gpu::calcOpticalFlowBM(loadMat(frame0), loadMat(frame1),
block_size, shift_size, max_range, false,
d_velx, d_vely, buf);
cv::Mat velx, vely;
calcOpticalFlowBM(frame0, frame1, block_size, shift_size, max_range, false, velx, vely);
EXPECT_MAT_NEAR(velx, d_velx, 0);
EXPECT_MAT_NEAR(vely, d_vely, 0);
}
INSTANTIATE_TEST_CASE_P(GPU_Video, OpticalFlowBM, ALL_DEVICES);
//////////////////////////////////////////////////////
// FastOpticalFlowBM
namespace
{
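// Reference implementation used to validate the GPU block-matching flow: for every pixel,
// exhaustively search a search_window x search_window neighbourhood for the displacement
// that minimizes the sum of absolute differences over a block_window x block_window patch,
// handling out-of-range pixels with cv::borderInterpolate.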
void FastOpticalFlowBM_gold(const cv::Mat_<uchar>& I0, const cv::Mat_<uchar>& I1, cv::Mat_<float>& velx, cv::Mat_<float>& vely, int search_window, int block_window)
{
velx.create(I0.size());
vely.create(I0.size());
int search_radius = search_window / 2;
int block_radius = block_window / 2;
for (int y = 0; y < I0.rows; ++y)
{
for (int x = 0; x < I0.cols; ++x)
{
int bestDist = std::numeric_limits<int>::max();
int bestDx = 0;
int bestDy = 0;
for (int dy = -search_radius; dy <= search_radius; ++dy)
{
for (int dx = -search_radius; dx <= search_radius; ++dx)
{
int dist = 0;
for (int by = -block_radius; by <= block_radius; ++by)
{
for (int bx = -block_radius; bx <= block_radius; ++bx)
{
int I0_val = I0(cv::borderInterpolate(y + by, I0.rows, cv::BORDER_DEFAULT), cv::borderInterpolate(x + bx, I0.cols, cv::BORDER_DEFAULT));
int I1_val = I1(cv::borderInterpolate(y + dy + by, I0.rows, cv::BORDER_DEFAULT), cv::borderInterpolate(x + dx + bx, I0.cols, cv::BORDER_DEFAULT));
dist += std::abs(I0_val - I1_val);
}
}
if (dist < bestDist)
{
bestDist = dist;
bestDx = dx;
bestDy = dy;
}
}
}
velx(y, x) = (float) bestDx;
vely(y, x) = (float) bestDy;
}
}
}
double calc_rmse(const cv::Mat_<float>& flow1, const cv::Mat_<float>& flow2)
{
double sum = 0.0;
for (int y = 0; y < flow1.rows; ++y)
{
for (int x = 0; x < flow1.cols; ++x)
{
double diff = flow1(y, x) - flow2(y, x);
sum += diff * diff;
}
}
return std::sqrt(sum / flow1.size().area());
}
}
struct FastOpticalFlowBM : testing::TestWithParam<cv::gpu::DeviceInfo>
{
};
GPU_TEST_P(FastOpticalFlowBM, Accuracy)
{
const double MAX_RMSE = 0.6;
int search_window = 15;
int block_window = 5;
cv::gpu::DeviceInfo devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame1.empty());
cv::Size smallSize(320, 240);
cv::Mat frame0_small;
cv::Mat frame1_small;
cv::resize(frame0, frame0_small, smallSize);
cv::resize(frame1, frame1_small, smallSize);
cv::gpu::GpuMat d_flowx;
cv::gpu::GpuMat d_flowy;
cv::gpu::FastOpticalFlowBM fastBM;
fastBM(loadMat(frame0_small), loadMat(frame1_small), d_flowx, d_flowy, search_window, block_window);
cv::Mat_<float> flowx;
cv::Mat_<float> flowy;
FastOpticalFlowBM_gold(frame0_small, frame1_small, flowx, flowy, search_window, block_window);
double err;
err = calc_rmse(flowx, cv::Mat(d_flowx));
EXPECT_LE(err, MAX_RMSE);
err = calc_rmse(flowy, cv::Mat(d_flowy));
EXPECT_LE(err, MAX_RMSE);
}
INSTANTIATE_TEST_CASE_P(GPU_Video, FastOpticalFlowBM, ALL_DEVICES);
#endif // HAVE_CUDA