gpucalib3d module for camera calibration and stereo correspondence

2013-04-17 17:51:19 +04:00
parent fc1fa28556
commit b08b9ab83b
32 changed files with 932 additions and 460 deletions
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -5,7 +5,7 @@ endif()
 set(the_description "GPU-accelerated Computer Vision")

 ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy
-                   opencv_gpuarithm opencv_gpufilters opencv_gpuimgproc opencv_gpufeatures2d opencv_gpuvideo
+                   opencv_gpuarithm opencv_gpufilters opencv_gpuimgproc opencv_gpufeatures2d opencv_gpuvideo opencv_gpucalib3d
                   OPTIONAL opencv_gpunvidia)

 ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")
--- a/modules/gpu/doc/camera_calibration_and_3d_reconstruction.rst
+++ b/modules/gpu/doc/camera_calibration_and_3d_reconstruction.rst
@@ -1,499 +0,0 @@
-Camera Calibration and 3D Reconstruction
-========================================
-
-.. highlight:: cpp
-
-
-
-gpu::StereoBM_GPU
-----------------
-.. ocv:class:: gpu::StereoBM_GPU
-
-Class computing stereo correspondence (disparity map) using the block matching algorithm. ::
-
-    class StereoBM_GPU
-    {
-    public:
-        enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };
-
-        enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };
-
-        StereoBM_GPU();
-        StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP,
-                     int winSize = DEFAULT_WINSZ);
-
-        void operator() (const GpuMat& left, const GpuMat& right,
-                         GpuMat& disparity, Stream& stream = Stream::Null());
-
-        static bool checkIfGpuCallReasonable();
-
-        int preset;
-        int ndisp;
-        int winSize;
-
-        float avergeTexThreshold;
-
-        ...
-    };
-
-
-The class also performs pre- and post-filtering steps: Sobel pre-filtering (if ``PREFILTER_XSOBEL`` flag is set) and low textureness filtering (if ``averageTexThreshols > 0`` ). If ``avergeTexThreshold = 0`` , low textureness filtering is disabled. Otherwise, the disparity is set to 0 in each point ``(x, y)`` , where for the left image
-
-.. math::
-    \sum HorizontalGradiensInWindow(x, y, winSize) < (winSize \cdot winSize) \cdot avergeTexThreshold
-
-This means that the input left image is low textured.
-
-
-
-gpu::StereoBM_GPU::StereoBM_GPU
-----------------------------------
-Enables :ocv:class:`gpu::StereoBM_GPU` constructors.
-
-.. ocv:function:: gpu::StereoBM_GPU::StereoBM_GPU()
-
-.. ocv:function:: gpu::StereoBM_GPU::StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ)
-
-    :param preset: Parameter presetting:
-
-        * **BASIC_PRESET** Basic mode without pre-processing.
-
-        * **PREFILTER_XSOBEL** Sobel pre-filtering mode.
-
-    :param ndisparities: Number of disparities. It must be a multiple of 8 and less or equal to 256.
-
-    :param winSize: Block size.
-
-
-
-gpu::StereoBM_GPU::operator ()
----------------------------------
-Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair.
-
-.. ocv:function:: void gpu::StereoBM_GPU::operator ()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null())
-
-    :param left: Left image. Only  ``CV_8UC1``  type is supported.
-
-    :param right: Right image with the same size and the same type as the left one.
-
-    :param disparity: Output disparity map. It is a  ``CV_8UC1``  image with the same size as the input images.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::StereoBM_GPU::checkIfGpuCallReasonable
-----------------------------------------------
-Uses a heuristic method to estimate whether the current GPU is faster than the CPU in this algorithm. It queries the currently active device.
-
-.. ocv:function:: bool gpu::StereoBM_GPU::checkIfGpuCallReasonable()
-
-
-
-gpu::StereoBeliefPropagation
----------------------------
-.. ocv:class:: gpu::StereoBeliefPropagation
-
-Class computing stereo correspondence using the belief propagation algorithm. ::
-
-    class StereoBeliefPropagation
-    {
-    public:
-        enum { DEFAULT_NDISP  = 64 };
-        enum { DEFAULT_ITERS  = 5  };
-        enum { DEFAULT_LEVELS = 5  };
-
-        static void estimateRecommendedParams(int width, int height,
-            int& ndisp, int& iters, int& levels);
-
-        explicit StereoBeliefPropagation(int ndisp = DEFAULT_NDISP,
-            int iters  = DEFAULT_ITERS,
-            int levels = DEFAULT_LEVELS,
-            int msg_type = CV_32F);
-        StereoBeliefPropagation(int ndisp, int iters, int levels,
-            float max_data_term, float data_weight,
-            float max_disc_term, float disc_single_jump,
-            int msg_type = CV_32F);
-
-        void operator()(const GpuMat& left, const GpuMat& right,
-                        GpuMat& disparity, Stream& stream = Stream::Null());
-        void operator()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null());
-
-        int ndisp;
-
-        int iters;
-        int levels;
-
-        float max_data_term;
-        float data_weight;
-        float max_disc_term;
-        float disc_single_jump;
-
-        int msg_type;
-
-        ...
-    };
-
-The class implements algorithm described in [Felzenszwalb2006]_ . It can compute own data cost (using a truncated linear model) or use a user-provided data cost.
-
-.. note::
-
-    ``StereoBeliefPropagation`` requires a lot of memory for message storage:
-
-    .. math::
-
-        width \_ step  \cdot height  \cdot ndisp  \cdot 4  \cdot (1 + 0.25)
-
-    and for data cost storage:
-
-    .. math::
-
-        width\_step \cdot height \cdot ndisp \cdot (1 + 0.25 + 0.0625 +  \dotsm + \frac{1}{4^{levels}})
-
-    ``width_step`` is the number of bytes in a line including padding.
-
-
-
-gpu::StereoBeliefPropagation::StereoBeliefPropagation
---------------------------------------------------------
-Enables the :ocv:class:`gpu::StereoBeliefPropagation` constructors.
-
-.. ocv:function:: gpu::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS, int msg_type = CV_32F)
-
-.. ocv:function:: gpu::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp, int iters, int levels, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int msg_type = CV_32F)
-
-    :param ndisp: Number of disparities.
-
-    :param iters: Number of BP iterations on each level.
-
-    :param levels: Number of levels.
-
-    :param max_data_term: Threshold for data cost truncation.
-
-    :param data_weight: Data weight.
-
-    :param max_disc_term: Threshold for discontinuity truncation.
-
-    :param disc_single_jump: Discontinuity single jump.
-
-    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
-
-``StereoBeliefPropagation`` uses a truncated linear model for the data cost and discontinuity terms:
-
-.. math::
-
-    DataCost = data \_ weight  \cdot \min ( \lvert Img_Left(x,y)-Img_Right(x-d,y)  \rvert , max \_ data \_ term)
-
-.. math::
-
-    DiscTerm =  \min (disc \_ single \_ jump  \cdot \lvert f_1-f_2  \rvert , max \_ disc \_ term)
-
-For more details, see [Felzenszwalb2006]_.
-
-By default, :ocv:class:`gpu::StereoBeliefPropagation` uses floating-point arithmetics and the ``CV_32FC1`` type for messages. But it can also use fixed-point arithmetics and the ``CV_16SC1`` message type for better performance. To avoid an overflow in this case, the parameters must satisfy the following requirement:
-
-.. math::
-
-    10  \cdot 2^{levels-1}  \cdot max \_ data \_ term < SHRT \_ MAX
-
-
-
-gpu::StereoBeliefPropagation::estimateRecommendedParams
-----------------------------------------------------------
-Uses a heuristic method to compute the recommended parameters ( ``ndisp``, ``iters`` and ``levels`` ) for the specified image size ( ``width`` and ``height`` ).
-
-.. ocv:function:: void gpu::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)
-
-
-
-gpu::StereoBeliefPropagation::operator ()
---------------------------------------------
-Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair or data cost.
-
-.. ocv:function:: void gpu::StereoBeliefPropagation::operator ()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::StereoBeliefPropagation::operator ()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null())
-
-    :param left: Left image. ``CV_8UC1`` , ``CV_8UC3``  and  ``CV_8UC4``  types are supported.
-
-    :param right: Right image with the same size and the same type as the left one.
-
-    :param data: User-specified data cost, a matrix of ``msg_type`` type and ``Size(<image columns>*ndisp, <image rows>)`` size.
-
-    :param disparity: Output disparity map. If  ``disparity``  is empty, the output type is  ``CV_16SC1`` . Otherwise, the type is retained.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::StereoConstantSpaceBP
--------------------------
-.. ocv:class:: gpu::StereoConstantSpaceBP
-
-Class computing stereo correspondence using the constant space belief propagation algorithm. ::
-
-    class StereoConstantSpaceBP
-    {
-    public:
-        enum { DEFAULT_NDISP    = 128 };
-        enum { DEFAULT_ITERS    = 8   };
-        enum { DEFAULT_LEVELS   = 4   };
-        enum { DEFAULT_NR_PLANE = 4   };
-
-        static void estimateRecommendedParams(int width, int height,
-            int& ndisp, int& iters, int& levels, int& nr_plane);
-
-        explicit StereoConstantSpaceBP(int ndisp = DEFAULT_NDISP,
-            int iters    = DEFAULT_ITERS,
-            int levels   = DEFAULT_LEVELS,
-            int nr_plane = DEFAULT_NR_PLANE,
-            int msg_type = CV_32F);
-        StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
-            float max_data_term, float data_weight,
-            float max_disc_term, float disc_single_jump,
-            int min_disp_th = 0,
-            int msg_type = CV_32F);
-
-        void operator()(const GpuMat& left, const GpuMat& right,
-                        GpuMat& disparity, Stream& stream = Stream::Null());
-
-        int ndisp;
-
-        int iters;
-        int levels;
-
-        int nr_plane;
-
-        float max_data_term;
-        float data_weight;
-        float max_disc_term;
-        float disc_single_jump;
-
-        int min_disp_th;
-
-        int msg_type;
-
-        bool use_local_init_data_cost;
-
-        ...
-    };
-
-
-The class implements algorithm described in [Yang2010]_. ``StereoConstantSpaceBP`` supports both local minimum and global minimum data cost initialization algorithms. For more details, see the paper mentioned above. By default, a local algorithm is used. To enable a global algorithm, set ``use_local_init_data_cost`` to ``false`` .
-
-
-
-gpu::StereoConstantSpaceBP::StereoConstantSpaceBP
-----------------------------------------------------
-Enables the :ocv:class:`gpu::StereoConstantSpaceBP` constructors.
-
-.. ocv:function:: gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS, int nr_plane = DEFAULT_NR_PLANE, int msg_type = CV_32F)
-
-.. ocv:function:: gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th = 0, int msg_type = CV_32F)
-
-    :param ndisp: Number of disparities.
-
-    :param iters: Number of BP iterations on each level.
-
-    :param levels: Number of levels.
-
-    :param nr_plane: Number of disparity levels on the first level.
-
-    :param max_data_term: Truncation of data cost.
-
-    :param data_weight: Data weight.
-
-    :param max_disc_term: Truncation of discontinuity.
-
-    :param disc_single_jump: Discontinuity single jump.
-
-    :param min_disp_th: Minimal disparity threshold.
-
-    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
-
-``StereoConstantSpaceBP`` uses a truncated linear model for the data cost and discontinuity terms:
-
-.. math::
-
-    DataCost = data \_ weight  \cdot \min ( \lvert I_2-I_1  \rvert , max \_ data \_ term)
-
-.. math::
-
-    DiscTerm =  \min (disc \_ single \_ jump  \cdot \lvert f_1-f_2  \rvert , max \_ disc \_ term)
-
-For more details, see [Yang2010]_.
-
-By default, ``StereoConstantSpaceBP`` uses floating-point arithmetics and the ``CV_32FC1`` type for messages. But it can also use fixed-point arithmetics and the ``CV_16SC1`` message type for better performance. To avoid an overflow in this case, the parameters must satisfy the following requirement:
-
-.. math::
-
-    10  \cdot 2^{levels-1}  \cdot max \_ data \_ term < SHRT \_ MAX
-
-
-
-gpu::StereoConstantSpaceBP::estimateRecommendedParams
---------------------------------------------------------
-Uses a heuristic method to compute parameters (ndisp, iters, levelsand nrplane) for the specified image size (widthand height).
-
-.. ocv:function:: void gpu::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane)
-
-
-
-gpu::StereoConstantSpaceBP::operator ()
-------------------------------------------
-Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair.
-
-.. ocv:function:: void gpu::StereoConstantSpaceBP::operator ()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null())
-
-    :param left: Left image. ``CV_8UC1`` , ``CV_8UC3``  and  ``CV_8UC4``  types are supported.
-
-    :param right: Right image with the same size and the same type as the left one.
-
-    :param disparity: Output disparity map. If  ``disparity``  is empty, the output type is  ``CV_16SC1`` . Otherwise, the output type is  ``disparity.type()`` .
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::DisparityBilateralFilter
-----------------------------
-.. ocv:class:: gpu::DisparityBilateralFilter
-
-Class refining a disparity map using joint bilateral filtering. ::
-
-    class CV_EXPORTS DisparityBilateralFilter
-    {
-    public:
-        enum { DEFAULT_NDISP  = 64 };
-        enum { DEFAULT_RADIUS = 3 };
-        enum { DEFAULT_ITERS  = 1 };
-
-        explicit DisparityBilateralFilter(int ndisp = DEFAULT_NDISP,
-            int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS);
-
-        DisparityBilateralFilter(int ndisp, int radius, int iters,
-            float edge_threshold, float max_disc_threshold,
-            float sigma_range);
-
-        void operator()(const GpuMat& disparity, const GpuMat& image,
-                        GpuMat& dst, Stream& stream = Stream::Null());
-
-        ...
-    };
-
-
-The class implements [Yang2010]_ algorithm.
-
-
-
-gpu::DisparityBilateralFilter::DisparityBilateralFilter
-----------------------------------------------------------
-Enables the :ocv:class:`gpu::DisparityBilateralFilter` constructors.
-
-.. ocv:function:: gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp = DEFAULT_NDISP, int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS)
-
-.. ocv:function:: gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, float sigma_range)
-
-    :param ndisp: Number of disparities.
-
-    :param radius: Filter radius.
-
-    :param iters: Number of iterations.
-
-    :param edge_threshold: Threshold for edges.
-
-    :param max_disc_threshold: Constant to reject outliers.
-
-    :param sigma_range: Filter range.
-
-
-
-gpu::DisparityBilateralFilter::operator ()
----------------------------------------------
-Refines a disparity map using joint bilateral filtering.
-
-.. ocv:function:: void gpu::DisparityBilateralFilter::operator ()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param disparity: Input disparity map.  ``CV_8UC1``  and  ``CV_16SC1``  types are supported.
-
-    :param image: Input image. ``CV_8UC1``  and  ``CV_8UC3``  types are supported.
-
-    :param dst: Destination disparity map. It has the same size and type as  ``disparity`` .
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::drawColorDisp
----------------------
-Colors a disparity image.
-
-.. ocv:function:: void gpu::drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, Stream& stream = Stream::Null())
-
-    :param src_disp: Source disparity image.  ``CV_8UC1``  and  ``CV_16SC1``  types are supported.
-
-    :param dst_disp: Output disparity image. It has the same size as  ``src_disp`` . The  type is ``CV_8UC4``  in  ``BGRA``  format (alpha = 255).
-
-    :param ndisp: Number of disparities.
-
-    :param stream: Stream for the asynchronous version.
-
-This function draws a colored disparity map by converting disparity values from ``[0..ndisp)`` interval first to ``HSV`` color space (where different disparity values correspond to different hues) and then converting the pixels to ``RGB`` for visualization.
-
-
-
-gpu::reprojectImageTo3D
---------------------------
-Reprojects a disparity image to 3D space.
-
-.. ocv:function:: void gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, int dst_cn = 4, Stream& stream = Stream::Null())
-
-    :param disp: Input disparity image.  ``CV_8U``  and  ``CV_16S``  types are supported.
-
-    :param xyzw: Output 3- or 4-channel floating-point image of the same size as  ``disp`` . Each element of  ``xyzw(x,y)``  contains 3D coordinates ``(x,y,z)`` or ``(x,y,z,1)``  of the point  ``(x,y)`` , computed from the disparity map.
-
-    :param Q: :math:`4 \times 4`  perspective transformation matrix that can be obtained via  :ocv:func:`stereoRectify` .
-
-    :param dst_cn: The number of channels for output image. Can be 3 or 4.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`reprojectImageTo3D`
-
-
-
-gpu::solvePnPRansac
-------------------
-Finds the object pose from 3D-2D point correspondences.
-
-.. ocv:function:: void gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat, const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false, int num_iters=100, float max_dist=8.0, int min_inlier_count=100, vector<int>* inliers=NULL)
-
-    :param object: Single-row matrix of object points.
-
-    :param image: Single-row matrix of image points.
-
-    :param camera_mat: 3x3 matrix of intrinsic camera parameters.
-
-    :param dist_coef: Distortion coefficients. See :ocv:func:`undistortPoints` for details.
-
-    :param rvec: Output 3D rotation vector.
-
-    :param tvec: Output 3D translation vector.
-
-    :param use_extrinsic_guess: Flag to indicate that the function must use ``rvec`` and ``tvec`` as an initial transformation guess. It is not supported for now.
-
-    :param num_iters: Maximum number of RANSAC iterations.
-
-    :param max_dist: Euclidean distance threshold to detect whether point is inlier or not.
-
-    :param min_inlier_count: Flag to indicate that the function must stop if greater or equal number of inliers is achieved. It is not supported for now.
-
-    :param inliers: Output vector of inlier indices.
-
-.. seealso:: :ocv:func:`solvePnPRansac`
-
-
-
-.. [Felzenszwalb2006] Pedro F. Felzenszwalb algorithm [Pedro F. Felzenszwalb and Daniel P. Huttenlocher. *Efficient belief propagation for early vision*. International Journal of Computer Vision, 70(1), October 2006
-
-.. [Yang2010] Q. Yang, L. Wang, and N. Ahuja. *A constant-space belief propagation algorithm for stereo matching*. In CVPR, 2010.
--- a/modules/gpu/doc/gpu.rst
+++ b/modules/gpu/doc/gpu.rst
@@ -9,4 +9,3 @@ gpu. GPU-accelerated Computer Vision
    initalization_and_information
    data_structures
    object_detection
-    camera_calibration_and_3d_reconstruction
--- a/modules/gpu/include/opencv2/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu.hpp
@@ -55,6 +55,7 @@
 #include "opencv2/gpuimgproc.hpp"
 #include "opencv2/gpufeatures2d.hpp"
 #include "opencv2/gpuvideo.hpp"
+#include "opencv2/gpucalib3d.hpp"

 #include "opencv2/imgproc.hpp"
 #include "opencv2/objdetect.hpp"
@@ -68,18 +69,6 @@ namespace cv { namespace gpu {

 ///////////////////////////// Calibration 3D //////////////////////////////////

-CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                                GpuMat& dst, Stream& stream = Stream::Null());
-
-CV_EXPORTS void projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                              const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst,
-                              Stream& stream = Stream::Null());
-
-CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,
-                               const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false,
-                               int num_iters=100, float max_dist=8.0, int min_inlier_count=100,
-                               std::vector<int>* inliers=NULL);
-
 //////////////////////////////// Image Labeling ////////////////////////////////


@@ -90,190 +79,17 @@ CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& c

 //////////////////////////////// StereoBM_GPU ////////////////////////////////

-class CV_EXPORTS StereoBM_GPU
-{
-public:
-    enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };

-    enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };
-
-    //! the default constructor
-    StereoBM_GPU();
-    //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be multiple of 8.
-    StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ);
-
-    //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair
-    //! Output disparity has CV_8U type.
-    void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());
-
-    //! Some heuristics that tries to estmate
-    // if current GPU will be faster than CPU in this algorithm.
-    // It queries current active device.
-    static bool checkIfGpuCallReasonable();
-
-    int preset;
-    int ndisp;
-    int winSize;
-
-    // If avergeTexThreshold  == 0 => post procesing is disabled
-    // If avergeTexThreshold != 0 then disparity is set 0 in each point (x,y) where for left image
-    // SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold
-    // i.e. input left image is low textured.
-    float avergeTexThreshold;
-
-private:
-    GpuMat minSSD, leBuf, riBuf;
-};

 ////////////////////////// StereoBeliefPropagation ///////////////////////////
-// "Efficient Belief Propagation for Early Vision"
-// P.Felzenszwalb

-class CV_EXPORTS StereoBeliefPropagation
-{
-public:
-    enum { DEFAULT_NDISP  = 64 };
-    enum { DEFAULT_ITERS  = 5  };
-    enum { DEFAULT_LEVELS = 5  };
-
-    static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels);
-
-    //! the default constructor
-    explicit StereoBeliefPropagation(int ndisp  = DEFAULT_NDISP,
-                                     int iters  = DEFAULT_ITERS,
-                                     int levels = DEFAULT_LEVELS,
-                                     int msg_type = CV_32F);
-
-    //! the full constructor taking the number of disparities, number of BP iterations on each level,
-    //! number of levels, truncation of data cost, data weight,
-    //! truncation of discontinuity cost and discontinuity single jump
-    //! DataTerm = data_weight * min(fabs(I2-I1), max_data_term)
-    //! DiscTerm = min(disc_single_jump * fabs(f1-f2), max_disc_term)
-    //! please see paper for more details
-    StereoBeliefPropagation(int ndisp, int iters, int levels,
-        float max_data_term, float data_weight,
-        float max_disc_term, float disc_single_jump,
-        int msg_type = CV_32F);
-
-    //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair,
-    //! if disparity is empty output type will be CV_16S else output type will be disparity.type().
-    void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());
-
-
-    //! version for user specified data term
-    void operator()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null());
-
-    int ndisp;
-
-    int iters;
-    int levels;
-
-    float max_data_term;
-    float data_weight;
-    float max_disc_term;
-    float disc_single_jump;
-
-    int msg_type;
-private:
-    GpuMat u, d, l, r, u2, d2, l2, r2;
-    std::vector<GpuMat> datas;
-    GpuMat out;
-};

 /////////////////////////// StereoConstantSpaceBP ///////////////////////////
-// "A Constant-Space Belief Propagation Algorithm for Stereo Matching"
-// Qingxiong Yang, Liang Wang, Narendra Ahuja
-// http://vision.ai.uiuc.edu/~qyang6/

-class CV_EXPORTS StereoConstantSpaceBP
-{
-public:
-    enum { DEFAULT_NDISP    = 128 };
-    enum { DEFAULT_ITERS    = 8   };
-    enum { DEFAULT_LEVELS   = 4   };
-    enum { DEFAULT_NR_PLANE = 4   };

-    static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane);
-
-    //! the default constructor
-    explicit StereoConstantSpaceBP(int ndisp    = DEFAULT_NDISP,
-                                   int iters    = DEFAULT_ITERS,
-                                   int levels   = DEFAULT_LEVELS,
-                                   int nr_plane = DEFAULT_NR_PLANE,
-                                   int msg_type = CV_32F);
-
-    //! the full constructor taking the number of disparities, number of BP iterations on each level,
-    //! number of levels, number of active disparity on the first level, truncation of data cost, data weight,
-    //! truncation of discontinuity cost, discontinuity single jump and minimum disparity threshold
-    StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
-        float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
-        int min_disp_th = 0,
-        int msg_type = CV_32F);
-
-    //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair,
-    //! if disparity is empty output type will be CV_16S else output type will be disparity.type().
-    void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());
-
-    int ndisp;
-
-    int iters;
-    int levels;
-
-    int nr_plane;
-
-    float max_data_term;
-    float data_weight;
-    float max_disc_term;
-    float disc_single_jump;
-
-    int min_disp_th;
-
-    int msg_type;
-
-    bool use_local_init_data_cost;
-private:
-    GpuMat messages_buffers;
-
-    GpuMat temp;
-    GpuMat out;
-};

 /////////////////////////// DisparityBilateralFilter ///////////////////////////
-// Disparity map refinement using joint bilateral filtering given a single color image.
-// Qingxiong Yang, Liang Wang, Narendra Ahuja
-// http://vision.ai.uiuc.edu/~qyang6/

-class CV_EXPORTS DisparityBilateralFilter
-{
-public:
-    enum { DEFAULT_NDISP  = 64 };
-    enum { DEFAULT_RADIUS = 3 };
-    enum { DEFAULT_ITERS  = 1 };
-
-    //! the default constructor
-    explicit DisparityBilateralFilter(int ndisp = DEFAULT_NDISP, int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS);
-
-    //! the full constructor taking the number of disparities, filter radius,
-    //! number of iterations, truncation of data continuity, truncation of disparity continuity
-    //! and filter range sigma
-    DisparityBilateralFilter(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, float sigma_range);
-
-    //! the disparity map refinement operator. Refine disparity map using joint bilateral filtering given a single color image.
-    //! disparity must have CV_8U or CV_16S type, image must have CV_8UC1 or CV_8UC3 type.
-    void operator()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst, Stream& stream = Stream::Null());
-
-private:
-    int ndisp;
-    int radius;
-    int iters;
-
-    float edge_threshold;
-    float max_disc_threshold;
-    float sigma_range;
-
-    GpuMat table_color;
-    GpuMat table_space;
-};


 //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
--- a/modules/gpu/perf/perf_calib3d.cpp
+++ b/modules/gpu/perf/perf_calib3d.cpp
@@ -1,371 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "perf_precomp.hpp"
-
-using namespace std;
-using namespace testing;
-using namespace perf;
-
-//////////////////////////////////////////////////////////////////////
-// StereoBM
-
-typedef std::tr1::tuple<string, string> pair_string;
-DEF_PARAM_TEST_1(ImagePair, pair_string);
-
-PERF_TEST_P(ImagePair, Calib3D_StereoBM,
-            Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png")))
-{
-    declare.time(300.0);
-
-    const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(imgLeft.empty());
-
-    const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(imgRight.empty());
-
-    const int preset = 0;
-    const int ndisp = 256;
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::StereoBM_GPU d_bm(preset, ndisp);
-
-        const cv::gpu::GpuMat d_imgLeft(imgLeft);
-        const cv::gpu::GpuMat d_imgRight(imgRight);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() d_bm(d_imgLeft, d_imgRight, dst);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Ptr<cv::StereoBM> bm = cv::createStereoBM(ndisp);
-
-        cv::Mat dst;
-
-        TEST_CYCLE() bm->compute(imgLeft, imgRight, dst);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// StereoBeliefPropagation
-
-PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation,
-            Values(pair_string("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png")))
-{
-    declare.time(300.0);
-
-    const cv::Mat imgLeft = readImage(GET_PARAM(0));
-    ASSERT_FALSE(imgLeft.empty());
-
-    const cv::Mat imgRight = readImage(GET_PARAM(1));
-    ASSERT_FALSE(imgRight.empty());
-
-    const int ndisp = 64;
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::StereoBeliefPropagation d_bp(ndisp);
-
-        const cv::gpu::GpuMat d_imgLeft(imgLeft);
-        const cv::gpu::GpuMat d_imgRight(imgRight);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() d_bp(d_imgLeft, d_imgRight, dst);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// StereoConstantSpaceBP
-
-PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP,
-            Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png")))
-{
-    declare.time(300.0);
-
-    const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(imgLeft.empty());
-
-    const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(imgRight.empty());
-
-    const int ndisp = 128;
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::StereoConstantSpaceBP d_csbp(ndisp);
-
-        const cv::gpu::GpuMat d_imgLeft(imgLeft);
-        const cv::gpu::GpuMat d_imgRight(imgRight);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() d_csbp(d_imgLeft, d_imgRight, dst);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// DisparityBilateralFilter
-
-PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter,
-            Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-disp.png")))
-{
-    const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    const cv::Mat disp = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(disp.empty());
-
-    const int ndisp = 128;
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::DisparityBilateralFilter d_filter(ndisp);
-
-        const cv::gpu::GpuMat d_img(img);
-        const cv::gpu::GpuMat d_disp(disp);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() d_filter(d_disp, d_img, dst);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// TransformPoints
-
-DEF_PARAM_TEST_1(Count, int);
-
-PERF_TEST_P(Count, Calib3D_TransformPoints,
-            Values(5000, 10000, 20000))
-{
-    const int count = GetParam();
-
-    cv::Mat src(1, count, CV_32FC3);
-    declare.in(src, WARMUP_RNG);
-
-    const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
-    const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::transformPoints(d_src, rvec, tvec, dst);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// ProjectPoints
-
-PERF_TEST_P(Count, Calib3D_ProjectPoints,
-            Values(5000, 10000, 20000))
-{
-    const int count = GetParam();
-
-    cv::Mat src(1, count, CV_32FC3);
-    declare.in(src, WARMUP_RNG);
-
-    const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
-    const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
-    const cv::Mat camera_mat = cv::Mat::ones(3, 3, CV_32FC1);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), dst);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// SolvePnPRansac
-
-PERF_TEST_P(Count, Calib3D_SolvePnPRansac,
-            Values(5000, 10000, 20000))
-{
-    declare.time(10.0);
-
-    const int count = GetParam();
-
-    cv::Mat object(1, count, CV_32FC3);
-    declare.in(object, WARMUP_RNG);
-
-    cv::Mat camera_mat(3, 3, CV_32FC1);
-    cv::randu(camera_mat, 0.5, 1);
-    camera_mat.at<float>(0, 1) = 0.f;
-    camera_mat.at<float>(1, 0) = 0.f;
-    camera_mat.at<float>(2, 0) = 0.f;
-    camera_mat.at<float>(2, 1) = 0.f;
-
-    const cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));
-
-    cv::Mat rvec_gold(1, 3, CV_32FC1);
-    cv::randu(rvec_gold, 0, 1);
-
-    cv::Mat tvec_gold(1, 3, CV_32FC1);
-    cv::randu(tvec_gold, 0, 1);
-
-    std::vector<cv::Point2f> image_vec;
-    cv::projectPoints(object, rvec_gold, tvec_gold, camera_mat, dist_coef, image_vec);
-
-    const cv::Mat image(1, count, CV_32FC2, &image_vec[0]);
-
-    cv::Mat rvec;
-    cv::Mat tvec;
-
-    if (PERF_RUN_GPU())
-    {
-        TEST_CYCLE() cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
-
-        GPU_SANITY_CHECK(rvec, 1e-3);
-        GPU_SANITY_CHECK(tvec, 1e-3);
-    }
-    else
-    {
-        TEST_CYCLE() cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
-
-        CPU_SANITY_CHECK(rvec, 1e-6);
-        CPU_SANITY_CHECK(tvec, 1e-6);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// ReprojectImageTo3D
-
-PERF_TEST_P(Sz_Depth, Calib3D_ReprojectImageTo3D,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16S)))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-
-    cv::Mat src(size, depth);
-    declare.in(src, WARMUP_RNG);
-
-    cv::Mat Q(4, 4, CV_32FC1);
-    cv::randu(Q, 0.1, 1.0);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::reprojectImageTo3D(d_src, dst, Q);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::reprojectImageTo3D(src, dst, Q);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// DrawColorDisp
-
-PERF_TEST_P(Sz_Depth, Calib3D_DrawColorDisp,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16S)))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int type = GET_PARAM(1);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::drawColorDisp(d_src, dst, 255);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
--- a/modules/gpu/src/calib3d.cpp
+++ b/modules/gpu/src/calib3d.cpp
@@ -1,294 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
-
-void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat&, Mat&, bool, int, float, int, std::vector<int>*) { throw_no_cuda(); }
-
-#else
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace transform_points
-    {
-        void call(const PtrStepSz<float3> src, const float* rot, const float* transl, PtrStepSz<float3> dst, cudaStream_t stream);
-    }
-
-    namespace project_points
-    {
-        void call(const PtrStepSz<float3> src, const float* rot, const float* transl, const float* proj, PtrStepSz<float2> dst, cudaStream_t stream);
-    }
-
-    namespace solve_pnp_ransac
-    {
-        int maxNumIters();
-
-        void computeHypothesisScores(
-                const int num_hypotheses, const int num_points, const float* rot_matrices,
-                const float3* transl_vectors, const float3* object, const float2* image,
-                const float dist_threshold, int* hypothesis_scores);
-    }
-}}}
-
-using namespace ::cv::gpu::cudev;
-
-namespace
-{
-    void transformPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, cudaStream_t stream)
-    {
-        CV_Assert(src.rows == 1 && src.cols > 0 && src.type() == CV_32FC3);
-        CV_Assert(rvec.size() == Size(3, 1) && rvec.type() == CV_32F);
-        CV_Assert(tvec.size() == Size(3, 1) && tvec.type() == CV_32F);
-
-        // Convert rotation vector into matrix
-        Mat rot;
-        Rodrigues(rvec, rot);
-
-        dst.create(src.size(), src.type());
-        transform_points::call(src, rot.ptr<float>(), tvec.ptr<float>(), dst, stream);
-    }
-}
-
-void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream)
-{
-    transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream));
-}
-
-namespace
-{
-    void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, cudaStream_t stream)
-    {
-        CV_Assert(src.rows == 1 && src.cols > 0 && src.type() == CV_32FC3);
-        CV_Assert(rvec.size() == Size(3, 1) && rvec.type() == CV_32F);
-        CV_Assert(tvec.size() == Size(3, 1) && tvec.type() == CV_32F);
-        CV_Assert(camera_mat.size() == Size(3, 3) && camera_mat.type() == CV_32F);
-        CV_Assert(dist_coef.empty()); // Undistortion isn't supported
-
-        // Convert rotation vector into matrix
-        Mat rot;
-        Rodrigues(rvec, rot);
-
-        dst.create(src.size(), CV_32FC2);
-        project_points::call(src, rot.ptr<float>(), tvec.ptr<float>(), camera_mat.ptr<float>(), dst,stream);
-    }
-}
-
-void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream)
-{
-    projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream));
-}
-
-namespace
-{
-    // Selects subset_size random different points from [0, num_points - 1] range
-    void selectRandom(int subset_size, int num_points, std::vector<int>& subset)
-    {
-        subset.resize(subset_size);
-        for (int i = 0; i < subset_size; ++i)
-        {
-            bool was;
-            do
-            {
-                subset[i] = rand() % num_points;
-                was = false;
-                for (int j = 0; j < i; ++j)
-                    if (subset[j] == subset[i])
-                    {
-                        was = true;
-                        break;
-                    }
-            } while (was);
-        }
-    }
-
-    // Computes rotation, translation pair for small subsets if the input data
-    class TransformHypothesesGenerator
-    {
-    public:
-        TransformHypothesesGenerator(const Mat& object_, const Mat& image_, const Mat& dist_coef_,
-                                     const Mat& camera_mat_, int num_points_, int subset_size_,
-                                     Mat rot_matrices_, Mat transl_vectors_)
-                : object(&object_), image(&image_), dist_coef(&dist_coef_), camera_mat(&camera_mat_),
-                  num_points(num_points_), subset_size(subset_size_), rot_matrices(rot_matrices_),
-                  transl_vectors(transl_vectors_) {}
-
-        void operator()(const BlockedRange& range) const
-        {
-            // Input data for generation of the current hypothesis
-            std::vector<int> subset_indices(subset_size);
-            Mat_<Point3f> object_subset(1, subset_size);
-            Mat_<Point2f> image_subset(1, subset_size);
-
-            // Current hypothesis data
-            Mat rot_vec(1, 3, CV_64F);
-            Mat rot_mat(3, 3, CV_64F);
-            Mat transl_vec(1, 3, CV_64F);
-
-            for (int iter = range.begin(); iter < range.end(); ++iter)
-            {
-                selectRandom(subset_size, num_points, subset_indices);
-                for (int i = 0; i < subset_size; ++i)
-                {
-                   object_subset(0, i) = object->at<Point3f>(subset_indices[i]);
-                   image_subset(0, i) = image->at<Point2f>(subset_indices[i]);
-                }
-
-                solvePnP(object_subset, image_subset, *camera_mat, *dist_coef, rot_vec, transl_vec);
-
-                // Remember translation vector
-                Mat transl_vec_ = transl_vectors.colRange(iter * 3, (iter + 1) * 3);
-                transl_vec = transl_vec.reshape(0, 1);
-                transl_vec.convertTo(transl_vec_, CV_32F);
-
-                // Remember rotation matrix
-                Rodrigues(rot_vec, rot_mat);
-                Mat rot_mat_ = rot_matrices.colRange(iter * 9, (iter + 1) * 9).reshape(0, 3);
-                rot_mat.convertTo(rot_mat_, CV_32F);
-            }
-        }
-
-        const Mat* object;
-        const Mat* image;
-        const Mat* dist_coef;
-        const Mat* camera_mat;
-        int num_points;
-        int subset_size;
-
-        // Hypotheses storage (global)
-        Mat rot_matrices;
-        Mat transl_vectors;
-    };
-}
-
-void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,
-                             const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess,
-                             int num_iters, float max_dist, int min_inlier_count,
-                             std::vector<int>* inliers)
-{
-    (void)min_inlier_count;
-    CV_Assert(object.rows == 1 && object.cols > 0 && object.type() == CV_32FC3);
-    CV_Assert(image.rows == 1 && image.cols > 0 && image.type() == CV_32FC2);
-    CV_Assert(object.cols == image.cols);
-    CV_Assert(camera_mat.size() == Size(3, 3) && camera_mat.type() == CV_32F);
-    CV_Assert(!use_extrinsic_guess); // We don't support initial guess for now
-    CV_Assert(num_iters <= solve_pnp_ransac::maxNumIters());
-
-    const int subset_size = 4;
-    const int num_points = object.cols;
-    CV_Assert(num_points >= subset_size);
-
-    // Unapply distortion and intrinsic camera transformations
-    Mat eye_camera_mat = Mat::eye(3, 3, CV_32F);
-    Mat empty_dist_coef;
-    Mat image_normalized;
-    undistortPoints(image, image_normalized, camera_mat, dist_coef, Mat(), eye_camera_mat);
-
-    // Hypotheses storage (global)
-    Mat rot_matrices(1, num_iters * 9, CV_32F);
-    Mat transl_vectors(1, num_iters * 3, CV_32F);
-
-    // Generate set of hypotheses using small subsets of the input data
-    TransformHypothesesGenerator body(object, image_normalized, empty_dist_coef, eye_camera_mat,
-                                      num_points, subset_size, rot_matrices, transl_vectors);
-    parallel_for(BlockedRange(0, num_iters), body);
-
-    // Compute scores (i.e. number of inliers) for each hypothesis
-    GpuMat d_object(object);
-    GpuMat d_image_normalized(image_normalized);
-    GpuMat d_hypothesis_scores(1, num_iters, CV_32S);
-    solve_pnp_ransac::computeHypothesisScores(
-            num_iters, num_points, rot_matrices.ptr<float>(), transl_vectors.ptr<float3>(),
-            d_object.ptr<float3>(), d_image_normalized.ptr<float2>(), max_dist * max_dist,
-            d_hypothesis_scores.ptr<int>());
-
-    // Find the best hypothesis index
-    Point best_idx;
-    double best_score;
-    minMaxLoc(d_hypothesis_scores, NULL, &best_score, NULL, &best_idx);
-    int num_inliers = static_cast<int>(best_score);
-
-    // Extract the best hypothesis data
-
-    Mat rot_mat = rot_matrices.colRange(best_idx.x * 9, (best_idx.x + 1) * 9).reshape(0, 3);
-    Rodrigues(rot_mat, rvec);
-    rvec = rvec.reshape(0, 1);
-
-    tvec = transl_vectors.colRange(best_idx.x * 3, (best_idx.x + 1) * 3).clone();
-    tvec = tvec.reshape(0, 1);
-
-    // Build vector of inlier indices
-    if (inliers != NULL)
-    {
-        inliers->clear();
-        inliers->reserve(num_inliers);
-
-        Point3f p, p_transf;
-        Point2f p_proj;
-        const float* rot = rot_mat.ptr<float>();
-        const float* transl = tvec.ptr<float>();
-
-        for (int i = 0; i < num_points; ++i)
-        {
-            p = object.at<Point3f>(0, i);
-            p_transf.x = rot[0] * p.x + rot[1] * p.y + rot[2] * p.z + transl[0];
-            p_transf.y = rot[3] * p.x + rot[4] * p.y + rot[5] * p.z + transl[1];
-            p_transf.z = rot[6] * p.x + rot[7] * p.y + rot[8] * p.z + transl[2];
-            p_proj.x = p_transf.x / p_transf.z;
-            p_proj.y = p_transf.y / p_transf.z;
-            if (norm(p_proj - image_normalized.at<Point2f>(0, i)) < max_dist)
-                inliers->push_back(i);
-        }
-    }
-}
-
-#endif
-
-
--- a/modules/gpu/src/cuda/calib3d.cu
+++ b/modules/gpu/src/cuda/calib3d.cu
@@ -1,193 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    #define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
-
-    namespace transform_points
-    {
-        __constant__ float3 crot0;
-        __constant__ float3 crot1;
-        __constant__ float3 crot2;
-        __constant__ float3 ctransl;
-
-        struct TransformOp : unary_function<float3, float3>
-        {
-            __device__ __forceinline__ float3 operator()(const float3& p) const
-            {
-                return make_float3(
-                        crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
-                        crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
-                        crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
-            }
-            __device__ __forceinline__ TransformOp() {}
-            __device__ __forceinline__ TransformOp(const TransformOp&) {}
-        };
-
-        void call(const PtrStepSz<float3> src, const float* rot,
-                  const float* transl, PtrStepSz<float3> dst,
-                  cudaStream_t stream)
-        {
-            cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
-            cv::gpu::cudev::transform(src, dst, TransformOp(), WithOutMask(), stream);
-        }
-    } // namespace transform_points
-
-    namespace project_points
-    {
-        __constant__ float3 crot0;
-        __constant__ float3 crot1;
-        __constant__ float3 crot2;
-        __constant__ float3 ctransl;
-        __constant__ float3 cproj0;
-        __constant__ float3 cproj1;
-
-        struct ProjectOp : unary_function<float3, float3>
-        {
-            __device__ __forceinline__ float2 operator()(const float3& p) const
-            {
-                // Rotate and translate in 3D
-                float3 t = make_float3(
-                        crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
-                        crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
-                        crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
-                // Project on 2D plane
-                return make_float2(
-                        (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
-                        (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
-            }
-            __device__ __forceinline__ ProjectOp() {}
-            __device__ __forceinline__ ProjectOp(const ProjectOp&) {}
-        };
-
-        void call(const PtrStepSz<float3> src, const float* rot,
-                  const float* transl, const float* proj, PtrStepSz<float2> dst,
-                  cudaStream_t stream)
-        {
-            cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
-            cv::gpu::cudev::transform(src, dst, ProjectOp(), WithOutMask(), stream);
-        }
-    } // namespace project_points
-
-    namespace solve_pnp_ransac
-    {
-        __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
-        __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
-
-        int maxNumIters()
-        {
-            return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
-        }
-
-        __device__ __forceinline__ float sqr(float x)
-        {
-            return x * x;
-        }
-
-        template <int BLOCK_SIZE>
-        __global__ void computeHypothesisScoresKernel(
-                const int num_points, const float3* object, const float2* image,
-                const float dist_threshold, int* g_num_inliers)
-        {
-            const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
-            const float3 &transl_vec = ctransl_vectors[blockIdx.x];
-            int num_inliers = 0;
-
-            for (int i = threadIdx.x; i < num_points; i += blockDim.x)
-            {
-                float3 p = object[i];
-                p = make_float3(
-                        rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
-                        rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
-                        rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
-                p.x /= p.z;
-                p.y /= p.z;
-                float2 image_p = image[i];
-                if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
-                    ++num_inliers;
-            }
-
-            __shared__ int s_num_inliers[BLOCK_SIZE];
-            reduce<BLOCK_SIZE>(s_num_inliers, num_inliers, threadIdx.x, plus<int>());
-
-            if (threadIdx.x == 0)
-                g_num_inliers[blockIdx.x] = num_inliers;
-        }
-
-        void computeHypothesisScores(
-                const int num_hypotheses, const int num_points, const float* rot_matrices,
-                const float3* transl_vectors, const float3* object, const float2* image,
-                const float dist_threshold, int* hypothesis_scores)
-        {
-            cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
-            cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
-
-            dim3 threads(256);
-            dim3 grid(num_hypotheses);
-
-            computeHypothesisScoresKernel<256><<<grid, threads>>>(
-                    num_points, object, image, dist_threshold, hypothesis_scores);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    } // namespace solvepnp_ransac
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/disp_bilateral_filter.cu
+++ b/modules/gpu/src/cuda/disp_bilateral_filter.cu
@@ -1,223 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace disp_bilateral_filter
-    {
-        __constant__ float* ctable_color;
-        __constant__ float* ctable_space;
-        __constant__ size_t ctable_space_step;
-
-        __constant__ int cndisp;
-        __constant__ int cradius;
-
-        __constant__ short cedge_disc;
-        __constant__ short cmax_disc;
-
-        void disp_load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
-            cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
-            size_t table_space_step = table_space.step / sizeof(float);
-            cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );
-
-            cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );
-
-            cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
-        }
-
-        template <int channels>
-        struct DistRgbMax
-        {
-            static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
-            {
-                uchar x = ::abs(a[0] - b[0]);
-                uchar y = ::abs(a[1] - b[1]);
-                uchar z = ::abs(a[2] - b[2]);
-                return (::max(::max(x, y), z));
-            }
-        };
-
-        template <>
-        struct DistRgbMax<1>
-        {
-            static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
-            {
-                return ::abs(a[0] - b[0]);
-            }
-        };
-
-        template <int channels, typename T>
-        __global__ void disp_bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
-        {
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-            const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
-
-            T dp[5];
-
-            if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
-            {
-                dp[0] = *(disp + (y  ) * disp_step + x + 0);
-                dp[1] = *(disp + (y-1) * disp_step + x + 0);
-                dp[2] = *(disp + (y  ) * disp_step + x - 1);
-                dp[3] = *(disp + (y+1) * disp_step + x + 0);
-                dp[4] = *(disp + (y  ) * disp_step + x + 1);
-
-                if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)
-                {
-                    const int ymin = ::max(0, y - cradius);
-                    const int xmin = ::max(0, x - cradius);
-                    const int ymax = ::min(h - 1, y + cradius);
-                    const int xmax = ::min(w - 1, x + cradius);
-
-                    float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
-
-                    const uchar* ic = img + y * img_step + channels * x;
-
-                    for(int yi = ymin; yi <= ymax; yi++)
-                    {
-                        const T* disp_y = disp + yi * disp_step;
-
-                        for(int xi = xmin; xi <= xmax; xi++)
-                        {
-                            const uchar* in = img + yi * img_step + channels * xi;
-
-                            uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
-
-                            const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];
-
-                            const T disp_reg = disp_y[xi];
-
-                            cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;
-                            cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;
-                            cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;
-                            cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;
-                            cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;
-                        }
-                    }
-
-                    float minimum = numeric_limits<float>::max();
-                    int id = 0;
-
-                    if (cost[0] < minimum)
-                    {
-                        minimum = cost[0];
-                        id = 0;
-                    }
-                    if (cost[1] < minimum)
-                    {
-                        minimum = cost[1];
-                        id = 1;
-                    }
-                    if (cost[2] < minimum)
-                    {
-                        minimum = cost[2];
-                        id = 2;
-                    }
-                    if (cost[3] < minimum)
-                    {
-                        minimum = cost[3];
-                        id = 3;
-                    }
-                    if (cost[4] < minimum)
-                    {
-                        minimum = cost[4];
-                        id = 4;
-                    }
-
-                    *(disp + y * disp_step + x) = dp[id];
-                }
-            }
-        }
-
-        template <typename T>
-        void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-            grid.x = divUp(disp.cols, threads.x << 1);
-            grid.y = divUp(disp.rows, threads.y);
-
-            switch (channels)
-            {
-            case 1:
-                for (int i = 0; i < iters; ++i)
-                {
-                    disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                    cudaSafeCall( cudaGetLastError() );
-
-                    disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                    cudaSafeCall( cudaGetLastError() );
-                }
-                break;
-            case 3:
-                for (int i = 0; i < iters; ++i)
-                {
-                    disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                    cudaSafeCall( cudaGetLastError() );
-
-                    disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                    cudaSafeCall( cudaGetLastError() );
-                }
-                break;
-            default:
-                CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
-            }
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void disp_bilateral_filter<uchar>(PtrStepSz<uchar> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
-        template void disp_bilateral_filter<short>(PtrStepSz<short> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
-    } // namespace bilateral_filter
-}}} // namespace cv { namespace gpu { namespace cudev
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/stereobm.cu
+++ b/modules/gpu/src/cuda/stereobm.cu
@@ -1,540 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace stereobm
-    {
-        //////////////////////////////////////////////////////////////////////////////////////////////////
-        /////////////////////////////////////// Stereo BM ////////////////////////////////////////////////
-        //////////////////////////////////////////////////////////////////////////////////////////////////
-
-        #define ROWSperTHREAD 21     // the number of rows a thread will process
-
-        #define BLOCK_W 128          // the thread block width (464)
-        #define N_DISPARITIES 8
-
-        #define STEREO_MIND 0                    // The minimum d range to check
-        #define STEREO_DISP_STEP N_DISPARITIES   // the d step, must be <= 1 to avoid aliasing
-
-        __constant__ unsigned int* cminSSDImage;
-        __constant__ size_t cminSSD_step;
-        __constant__ int cwidth;
-        __constant__ int cheight;
-
-        __device__ __forceinline__ int SQ(int a)
-        {
-            return a * a;
-        }
-
-        template<int RADIUS>
-        __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
-        {
-            unsigned int cache = 0;
-            unsigned int cache2 = 0;
-
-            for(int i = 1; i <= RADIUS; i++)
-                cache += col_ssd[i];
-
-            col_ssd_cache[0] = cache;
-
-            __syncthreads();
-
-            if (threadIdx.x < BLOCK_W - RADIUS)
-                cache2 = col_ssd_cache[RADIUS];
-            else
-                for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)
-                    cache2 += col_ssd[i];
-
-            return col_ssd[0] + cache + cache2;
-        }
-
-        template<int RADIUS>
-        __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
-        {
-            unsigned int ssd[N_DISPARITIES];
-
-            //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-            ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
-            __syncthreads();
-            ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
-            __syncthreads();
-            ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
-            __syncthreads();
-            ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
-            __syncthreads();
-            ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
-            __syncthreads();
-            ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
-            __syncthreads();
-            ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
-            __syncthreads();
-            ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
-
-            int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));
-
-            int bestIdx = 0;
-            for (int i = 0; i < N_DISPARITIES; i++)
-            {
-                if (mssd == ssd[i])
-                    bestIdx = i;
-            }
-
-            return make_uint2(mssd, bestIdx);
-        }
-
-        template<int RADIUS>
-        __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
-        {
-            unsigned char leftPixel1;
-            unsigned char leftPixel2;
-            unsigned char rightPixel1[8];
-            unsigned char rightPixel2[8];
-            unsigned int diff1, diff2;
-
-            leftPixel1 = imageL[idx1];
-            leftPixel2 = imageL[idx2];
-
-            idx1 = idx1 - d;
-            idx2 = idx2 - d;
-
-            rightPixel1[7] = imageR[idx1 - 7];
-            rightPixel1[0] = imageR[idx1 - 0];
-            rightPixel1[1] = imageR[idx1 - 1];
-            rightPixel1[2] = imageR[idx1 - 2];
-            rightPixel1[3] = imageR[idx1 - 3];
-            rightPixel1[4] = imageR[idx1 - 4];
-            rightPixel1[5] = imageR[idx1 - 5];
-            rightPixel1[6] = imageR[idx1 - 6];
-
-            rightPixel2[7] = imageR[idx2 - 7];
-            rightPixel2[0] = imageR[idx2 - 0];
-            rightPixel2[1] = imageR[idx2 - 1];
-            rightPixel2[2] = imageR[idx2 - 2];
-            rightPixel2[3] = imageR[idx2 - 3];
-            rightPixel2[4] = imageR[idx2 - 4];
-            rightPixel2[5] = imageR[idx2 - 5];
-            rightPixel2[6] = imageR[idx2 - 6];
-
-            //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-            diff1 = leftPixel1 - rightPixel1[0];
-            diff2 = leftPixel2 - rightPixel2[0];
-            col_ssd[0 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-            diff1 = leftPixel1 - rightPixel1[1];
-            diff2 = leftPixel2 - rightPixel2[1];
-            col_ssd[1 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-            diff1 = leftPixel1 - rightPixel1[2];
-            diff2 = leftPixel2 - rightPixel2[2];
-            col_ssd[2 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-            diff1 = leftPixel1 - rightPixel1[3];
-            diff2 = leftPixel2 - rightPixel2[3];
-            col_ssd[3 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-            diff1 = leftPixel1 - rightPixel1[4];
-            diff2 = leftPixel2 - rightPixel2[4];
-            col_ssd[4 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-            diff1 = leftPixel1 - rightPixel1[5];
-            diff2 = leftPixel2 - rightPixel2[5];
-            col_ssd[5 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-            diff1 = leftPixel1 - rightPixel1[6];
-            diff2 = leftPixel2 - rightPixel2[6];
-            col_ssd[6 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-            diff1 = leftPixel1 - rightPixel1[7];
-            diff2 = leftPixel2 - rightPixel2[7];
-            col_ssd[7 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-        }
-
-        template<int RADIUS>
-        __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
-        {
-            unsigned char leftPixel1;
-            int idx;
-            unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};
-
-            for(int i = 0; i < (2 * RADIUS + 1); i++)
-            {
-                idx = y_tex * im_pitch + x_tex;
-                leftPixel1 = imageL[idx];
-                idx = idx - d;
-
-                diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);
-                diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
-                diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
-                diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
-                diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);
-                diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);
-                diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);
-                diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
-
-                y_tex += 1;
-            }
-            //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-            col_ssd[0 * (BLOCK_W + 2 * RADIUS)] = diffa[0];
-            col_ssd[1 * (BLOCK_W + 2 * RADIUS)] = diffa[1];
-            col_ssd[2 * (BLOCK_W + 2 * RADIUS)] = diffa[2];
-            col_ssd[3 * (BLOCK_W + 2 * RADIUS)] = diffa[3];
-            col_ssd[4 * (BLOCK_W + 2 * RADIUS)] = diffa[4];
-            col_ssd[5 * (BLOCK_W + 2 * RADIUS)] = diffa[5];
-            col_ssd[6 * (BLOCK_W + 2 * RADIUS)] = diffa[6];
-            col_ssd[7 * (BLOCK_W + 2 * RADIUS)] = diffa[7];
-        }
-
-        template<int RADIUS>
-        __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)
-        {
-            extern __shared__ unsigned int col_ssd_cache[];
-            volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
-            volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0;  //#define N_DIRTY_PIXELS (2 * RADIUS)
-
-            //#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
-            int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
-            //#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)
-            #define Y (blockIdx.y * ROWSperTHREAD + RADIUS)
-            //int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
-
-            unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
-            unsigned char* disparImage = disp.data + X + Y * disp.step;
-         /*   if (X < cwidth)
-            {
-                unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
-                for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
-                    *ptr = 0xFFFFFFFF;
-            }*/
-            int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
-            int y_tex;
-            int x_tex = X - RADIUS;
-
-            if (x_tex >= cwidth)
-                return;
-
-            for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
-            {
-                y_tex = Y - RADIUS;
-
-                InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);
-
-                if (col_ssd_extra > 0)
-                    if (x_tex + BLOCK_W < cwidth)
-                        InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
-
-                __syncthreads(); //before MinSSD function
-
-                if (X < cwidth - RADIUS && Y < cheight - RADIUS)
-                {
-                    uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
-                    if (minSSD.x < minSSDImage[0])
-                    {
-                        disparImage[0] = (unsigned char)(d + minSSD.y);
-                        minSSDImage[0] = minSSD.x;
-                    }
-                }
-
-                for(int row = 1; row < end_row; row++)
-                {
-                    int idx1 = y_tex * img_step + x_tex;
-                    int idx2 = (y_tex + (2 * RADIUS + 1)) * img_step + x_tex;
-
-                    __syncthreads();
-
-                    StepDown<RADIUS>(idx1, idx2, left, right, d, col_ssd);
-
-                    if (col_ssd_extra)
-                        if (x_tex + BLOCK_W < cwidth)
-                            StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
-
-                    y_tex += 1;
-
-                    __syncthreads(); //before MinSSD function
-
-                    if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
-                    {
-                        int idx = row * cminSSD_step;
-                        uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
-                        if (minSSD.x < minSSDImage[idx])
-                        {
-                            disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
-                            minSSDImage[idx] = minSSD.x;
-                        }
-                    }
-                } // for row loop
-            } // for d loop
-        }
-
-
-        template<int RADIUS> void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream)
-        {
-            dim3 grid(1,1,1);
-            dim3 threads(BLOCK_W, 1, 1);
-
-            grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
-            grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
-
-            //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-            size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
-
-            stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        };
-
-        typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream);
-
-        const static kernel_caller_t callers[] =
-        {
-            0,
-            kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
-            kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
-            kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
-            kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
-            kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
-
-            //0,0,0, 0,0,0, 0,0,kernel_caller<9>
-        };
-        const int calles_num = sizeof(callers)/sizeof(callers[0]);
-
-        void stereoBM_GPU(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
-        {
-            int winsz2 = winsz >> 1;
-
-            if (winsz2 == 0 || winsz2 >= calles_num)
-                CV_Error(cv::Error::StsBadArg, "Unsupported window size");
-
-            //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
-            //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
-
-            cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
-            cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
-
-            cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
-            cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
-            cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
-
-            size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
-            cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step,  &minssd_step, sizeof(minssd_step) ) );
-
-            callers[winsz2](left, right, disp, maxdisp, stream);
-        }
-
-        //////////////////////////////////////////////////////////////////////////////////////////////////
-        /////////////////////////////////////// Sobel Prefiler ///////////////////////////////////////////
-        //////////////////////////////////////////////////////////////////////////////////////////////////
-
-        texture<unsigned char, 2, cudaReadModeElementType> texForSobel;
-
-        __global__ void prefilter_kernel(PtrStepSzb output, int prefilterCap)
-        {
-            int x = blockDim.x * blockIdx.x + threadIdx.x;
-            int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < output.cols && y < output.rows)
-            {
-                int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +
-                           (int)tex2D(texForSobel, x - 1, y    ) * (-2) + (int)tex2D(texForSobel, x + 1, y    ) * (2) +
-                           (int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);
-
-
-                conv = ::min(::min(::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);
-                output.ptr(y)[x] = conv & 0xFF;
-            }
-        }
-
-        void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, cudaStream_t & stream)
-        {
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-            cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );
-
-            dim3 threads(16, 16, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(input.cols, threads.x);
-            grid.y = divUp(input.rows, threads.y);
-
-            prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-
-            cudaSafeCall( cudaUnbindTexture (texForSobel ) );
-        }
-
-
-        //////////////////////////////////////////////////////////////////////////////////////////////////
-        /////////////////////////////////// Textureness filtering ////////////////////////////////////////
-        //////////////////////////////////////////////////////////////////////////////////////////////////
-
-        texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
-
-        __device__ __forceinline__ float sobel(int x, int y)
-        {
-            float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
-                         tex2D(texForTF, x - 1, y    ) * (-2) + tex2D(texForTF, x + 1, y    ) * (2) +
-                         tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
-            return fabs(conv);
-        }
-
-        __device__ float CalcSums(float *cols, float *cols_cache, int winsz)
-        {
-            float cache = 0;
-            float cache2 = 0;
-            int winsz2 = winsz/2;
-
-            for(int i = 1; i <= winsz2; i++)
-                cache += cols[i];
-
-            cols_cache[0] = cache;
-
-            __syncthreads();
-
-            if (threadIdx.x < blockDim.x - winsz2)
-                cache2 = cols_cache[winsz2];
-            else
-                for(int i = winsz2 + 1; i < winsz; i++)
-                    cache2 += cols[i];
-
-            return cols[0] + cache + cache2;
-        }
-
-        #define RpT (2 * ROWSperTHREAD)  // got experimentally
-
-        __global__ void textureness_kernel(PtrStepSzb disp, int winsz, float threshold)
-        {
-            int winsz2 = winsz/2;
-            int n_dirty_pixels = (winsz2) * 2;
-
-            extern __shared__ float cols_cache[];
-            float *cols = cols_cache + blockDim.x + threadIdx.x;
-            float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;
-
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int beg_row = blockIdx.y * RpT;
-            int end_row = ::min(beg_row + RpT, disp.rows);
-
-            if (x < disp.cols)
-            {
-                int y = beg_row;
-
-                float sum = 0;
-                float sum_extra = 0;
-
-                for(int i = y - winsz2; i <= y + winsz2; ++i)
-                {
-                    sum += sobel(x - winsz2, i);
-                    if (cols_extra)
-                        sum_extra += sobel(x + blockDim.x - winsz2, i);
-                }
-                *cols = sum;
-                if (cols_extra)
-                    *cols_extra = sum_extra;
-
-                __syncthreads();
-
-                float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
-                if (sum_win < threshold)
-                    disp.data[y * disp.step + x] = 0;
-
-                __syncthreads();
-
-                for(int y = beg_row + 1; y < end_row; ++y)
-                {
-                    sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);
-                    *cols = sum;
-
-                    if (cols_extra)
-                    {
-                        sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
-                        *cols_extra = sum_extra;
-                    }
-
-                    __syncthreads();
-                    float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
-                    if (sum_win < threshold)
-                        disp.data[y * disp.step + x] = 0;
-
-                    __syncthreads();
-                }
-            }
-        }
-
-        void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream)
-        {
-            avgTexturenessThreshold *= winsz * winsz;
-
-            texForTF.filterMode     = cudaFilterModeLinear;
-            texForTF.addressMode[0] = cudaAddressModeWrap;
-            texForTF.addressMode[1] = cudaAddressModeWrap;
-
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-            cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );
-
-            dim3 threads(128, 1, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(input.cols, threads.x);
-            grid.y = divUp(input.rows, RpT);
-
-            size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
-            textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-
-            cudaSafeCall( cudaUnbindTexture (texForTF) );
-        }
-    } // namespace stereobm
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/stereobp.cu
+++ b/modules/gpu/src/cuda/stereobp.cu
@@ -1,538 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace stereobp
-    {
-        ///////////////////////////////////////////////////////////////
-        /////////////////////// load constants ////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        __constant__ int   cndisp;
-        __constant__ float cmax_data_term;
-        __constant__ float cdata_weight;
-        __constant__ float cmax_disc_term;
-        __constant__ float cdisc_single_jump;
-
-        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(cndisp,            &ndisp,            sizeof(int  )) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term,    &max_data_term,    sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cdata_weight,      &data_weight,      sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term,    &max_disc_term,    sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
-        }
-
-        ///////////////////////////////////////////////////////////////
-        ////////////////////////// comp data //////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        template <int cn> struct PixDiff;
-        template <> struct PixDiff<1>
-        {
-            __device__ __forceinline__ PixDiff(const uchar* ls)
-            {
-                l = *ls;
-            }
-            __device__ __forceinline__ float operator()(const uchar* rs) const
-            {
-                return ::abs((int)l - *rs);
-            }
-            uchar l;
-        };
-        template <> struct PixDiff<3>
-        {
-            __device__ __forceinline__ PixDiff(const uchar* ls)
-            {
-                l = *((uchar3*)ls);
-            }
-            __device__ __forceinline__ float operator()(const uchar* rs) const
-            {
-                const float tr = 0.299f;
-                const float tg = 0.587f;
-                const float tb = 0.114f;
-
-                float val  = tb * ::abs((int)l.x - rs[0]);
-                      val += tg * ::abs((int)l.y - rs[1]);
-                      val += tr * ::abs((int)l.z - rs[2]);
-
-                return val;
-            }
-            uchar3 l;
-        };
-        template <> struct PixDiff<4>
-        {
-            __device__ __forceinline__ PixDiff(const uchar* ls)
-            {
-                l = *((uchar4*)ls);
-            }
-            __device__ __forceinline__ float operator()(const uchar* rs) const
-            {
-                const float tr = 0.299f;
-                const float tg = 0.587f;
-                const float tb = 0.114f;
-
-                uchar4 r = *((uchar4*)rs);
-
-                float val  = tb * ::abs((int)l.x - r.x);
-                      val += tg * ::abs((int)l.y - r.y);
-                      val += tr * ::abs((int)l.z - r.z);
-
-                return val;
-            }
-            uchar4 l;
-        };
-
-        template <int cn, typename D>
-        __global__ void comp_data(const PtrStepSzb left, const PtrStepb right, PtrStep<D> data)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)
-            {
-                const uchar* ls = left.ptr(y) + x * cn;
-                const PixDiff<cn> pixDiff(ls);
-                const uchar* rs = right.ptr(y) + x * cn;
-
-                D* ds = data.ptr(y) + x;
-                const size_t disp_step = data.step * left.rows / sizeof(D);
-
-                for (int disp = 0; disp < cndisp; disp++)
-                {
-                    if (x - disp >= 1)
-                    {
-                        float val = pixDiff(rs - disp * cn);
-
-                        ds[disp * disp_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));
-                    }
-                    else
-                    {
-                        ds[disp * disp_step] = saturate_cast<D>(cdata_weight * cmax_data_term);
-                    }
-                }
-            }
-        }
-
-        template<typename T, typename D>
-        void comp_data_gpu(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
-
-        template <> void comp_data_gpu<uchar, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(left.cols, threads.x);
-            grid.y = divUp(left.rows, threads.y);
-
-            comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-        template <> void comp_data_gpu<uchar, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(left.cols, threads.x);
-            grid.y = divUp(left.rows, threads.y);
-
-            comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template <> void comp_data_gpu<uchar3, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(left.cols, threads.x);
-            grid.y = divUp(left.rows, threads.y);
-
-            comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-        template <> void comp_data_gpu<uchar3, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(left.cols, threads.x);
-            grid.y = divUp(left.rows, threads.y);
-
-            comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template <> void comp_data_gpu<uchar4, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(left.cols, threads.x);
-            grid.y = divUp(left.rows, threads.y);
-
-            comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-        template <> void comp_data_gpu<uchar4, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(left.cols, threads.x);
-            grid.y = divUp(left.rows, threads.y);
-
-            comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        ///////////////////////////////////////////////////////////////
-        //////////////////////// data step down ///////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        template <typename T>
-        __global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < dst_cols && y < dst_rows)
-            {
-                for (int d = 0; d < cndisp; ++d)
-                {
-                    float dst_reg  = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];
-                          dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];
-                          dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];
-                          dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];
-
-                    dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);
-                }
-            }
-        }
-
-        template<typename T>
-        void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(dst_cols, threads.x);
-            grid.y = divUp(dst_rows, threads.y);
-
-            data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)src, (PtrStepSz<T>)dst);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
-        template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
-
-        ///////////////////////////////////////////////////////////////
-        /////////////////// level up messages  ////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        template <typename T>
-        __global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < dst_cols && y < dst_rows)
-            {
-                const size_t dst_disp_step = dst.step * dst_rows / sizeof(T);
-                const size_t src_disp_step = src.step * src_rows / sizeof(T);
-
-                T*       dstr = dst.ptr(y  ) + x;
-                const T* srcr = src.ptr(y/2) + x/2;
-
-                for (int d = 0; d < cndisp; ++d)
-                    dstr[d * dst_disp_step] = srcr[d * src_disp_step];
-            }
-        }
-
-        template <typename T>
-        void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(dst_cols, threads.x);
-            grid.y = divUp(dst_rows, threads.y);
-
-            int src_idx = (dst_idx + 1) & 1;
-
-            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mus[src_idx], (PtrStepSz<T>)mus[dst_idx]);
-            cudaSafeCall( cudaGetLastError() );
-
-            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mds[src_idx], (PtrStepSz<T>)mds[dst_idx]);
-            cudaSafeCall( cudaGetLastError() );
-
-            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mls[src_idx], (PtrStepSz<T>)mls[dst_idx]);
-            cudaSafeCall( cudaGetLastError() );
-
-            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mrs[src_idx], (PtrStepSz<T>)mrs[dst_idx]);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
-        template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
-
-        ///////////////////////////////////////////////////////////////
-        ////////////////////  calc all iterations /////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        template <typename T>
-        __device__ void calc_min_linear_penalty(T* dst, size_t step)
-        {
-            float prev = dst[0];
-            float cur;
-            for (int disp = 1; disp < cndisp; ++disp)
-            {
-                prev += cdisc_single_jump;
-                cur = dst[step * disp];
-                if (prev < cur)
-                {
-                    cur = prev;
-                    dst[step * disp] = saturate_cast<T>(prev);
-                }
-                prev = cur;
-            }
-
-            prev = dst[(cndisp - 1) * step];
-            for (int disp = cndisp - 2; disp >= 0; disp--)
-            {
-                prev += cdisc_single_jump;
-                cur = dst[step * disp];
-                if (prev < cur)
-                {
-                    cur = prev;
-                    dst[step * disp] = saturate_cast<T>(prev);
-                }
-                prev = cur;
-            }
-        }
-
-        template <typename T>
-        __device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
-        {
-            float minimum = cudev::numeric_limits<float>::max();
-
-            for(int i = 0; i < cndisp; ++i)
-            {
-                float dst_reg  = msg1[msg_disp_step * i];
-                      dst_reg += msg2[msg_disp_step * i];
-                      dst_reg += msg3[msg_disp_step * i];
-                      dst_reg += data[data_disp_step * i];
-
-                if (dst_reg < minimum)
-                    minimum = dst_reg;
-
-                dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);
-            }
-
-            calc_min_linear_penalty(dst, msg_disp_step);
-
-            minimum += cmax_disc_term;
-
-            float sum = 0;
-            for(int i = 0; i < cndisp; ++i)
-            {
-                float dst_reg = dst[msg_disp_step * i];
-                if (dst_reg > minimum)
-                {
-                    dst_reg = minimum;
-                    dst[msg_disp_step * i] = saturate_cast<T>(minimum);
-                }
-                sum += dst_reg;
-            }
-            sum /= cndisp;
-
-            for(int i = 0; i < cndisp; ++i)
-                dst[msg_disp_step * i] -= sum;
-        }
-
-        template <typename T>
-        __global__ void one_iteration(int t, int elem_step, T* u, T* d, T* l, T* r, const PtrStep<T> data, int cols, int rows)
-        {
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-            const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
-
-            if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
-            {
-                T* us = u + y * elem_step + x;
-                T* ds = d + y * elem_step + x;
-                T* ls = l + y * elem_step + x;
-                T* rs = r + y * elem_step + x;
-                const T* dt = data.ptr(y) + x;
-
-                size_t msg_disp_step = elem_step * rows;
-                size_t data_disp_step = data.step * rows / sizeof(T);
-
-                message(us + elem_step, ls         + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
-                message(ds - elem_step, ls         + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
-                message(us + elem_step, ds - elem_step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
-                message(us + elem_step, ds - elem_step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
-            }
-        }
-
-        template <typename T>
-        void calc_all_iterations_gpu(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d,
-            const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(cols, threads.x << 1);
-            grid.y = divUp(rows, threads.y);
-
-            int elem_step = (int)(u.step / sizeof(T));
-
-            for(int t = 0; t < iters; ++t)
-            {
-                one_iteration<T><<<grid, threads, 0, stream>>>(t, elem_step, (T*)u.data, (T*)d.data, (T*)l.data, (T*)r.data, (PtrStepSz<T>)data, cols, rows);
-                cudaSafeCall( cudaGetLastError() );
-
-                if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        }
-
-        template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
-        template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
-
-        ///////////////////////////////////////////////////////////////
-        /////////////////////////// output ////////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        template <typename T>
-        __global__ void output(const int elem_step, const T* u, const T* d, const T* l, const T* r, const T* data,
-            PtrStepSz<short> disp)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
-            {
-                const T* us = u + (y + 1) * elem_step + x;
-                const T* ds = d + (y - 1) * elem_step + x;
-                const T* ls = l + y * elem_step + (x + 1);
-                const T* rs = r + y * elem_step+ (x - 1);
-                const T* dt = data + y * elem_step + x;
-
-                size_t disp_step = disp.rows * elem_step;
-
-                int best = 0;
-                float best_val = numeric_limits<float>::max();
-                for (int d = 0; d < cndisp; ++d)
-                {
-                    float val  = us[d * disp_step];
-                          val += ds[d * disp_step];
-                          val += ls[d * disp_step];
-                          val += rs[d * disp_step];
-                          val += dt[d * disp_step];
-
-                    if (val < best_val)
-                    {
-                        best_val = val;
-                        best = d;
-                    }
-                }
-
-                disp.ptr(y)[x] = saturate_cast<short>(best);
-            }
-        }
-
-        template <typename T>
-        void output_gpu(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data,
-            const PtrStepSz<short>& disp, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(disp.cols, threads.x);
-            grid.y = divUp(disp.rows, threads.y);
-
-            int elem_step = static_cast<int>(u.step/sizeof(T));
-
-            output<T><<<grid, threads, 0, stream>>>(elem_step, (const T*)u.data, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void output_gpu<short>(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
-        template void output_gpu<float>(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
-    } // namespace stereobp
-}}} // namespace cv { namespace gpu { namespace cudev
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/stereocsbp.cu
+++ b/modules/gpu/src/cuda/stereocsbp.cu
@@ -1,864 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace stereocsbp
-    {
-        ///////////////////////////////////////////////////////////////
-        /////////////////////// load constants ////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        __constant__ int cndisp;
-
-        __constant__ float cmax_data_term;
-        __constant__ float cdata_weight;
-        __constant__ float cmax_disc_term;
-        __constant__ float cdisc_single_jump;
-
-        __constant__ int cth;
-
-        __constant__ size_t cimg_step;
-        __constant__ size_t cmsg_step;
-        __constant__ size_t cdisp_step1;
-        __constant__ size_t cdisp_step2;
-
-        __constant__ uchar* cleft;
-        __constant__ uchar* cright;
-        __constant__ uchar* ctemp;
-
-
-        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
-                            const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& temp)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
-
-            cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term,    &max_data_term,    sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cdata_weight,      &data_weight,      sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term,    &max_disc_term,    sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
-
-            cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(int)) );
-
-            cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) );
-
-            cudaSafeCall( cudaMemcpyToSymbol(cleft,  &left.data,  sizeof(left.data)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) );
-            cudaSafeCall( cudaMemcpyToSymbol(ctemp, &temp.data, sizeof(temp.data)) );
-        }
-
-        ///////////////////////////////////////////////////////////////
-        /////////////////////// init data cost ////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        template <int channels> struct DataCostPerPixel;
-        template <> struct DataCostPerPixel<1>
-        {
-            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
-            {
-                return fmin(cdata_weight * ::abs((int)*left - *right), cdata_weight * cmax_data_term);
-            }
-        };
-        template <> struct DataCostPerPixel<3>
-        {
-            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
-            {
-                float tb = 0.114f * ::abs((int)left[0] - right[0]);
-                float tg = 0.587f * ::abs((int)left[1] - right[1]);
-                float tr = 0.299f * ::abs((int)left[2] - right[2]);
-
-                return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
-            }
-        };
-        template <> struct DataCostPerPixel<4>
-        {
-            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
-            {
-                uchar4 l = *((const uchar4*)left);
-                uchar4 r = *((const uchar4*)right);
-
-                float tb = 0.114f * ::abs((int)l.x - r.x);
-                float tg = 0.587f * ::abs((int)l.y - r.y);
-                float tr = 0.299f * ::abs((int)l.z - r.z);
-
-                return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
-            }
-        };
-
-        template <typename T>
-        __global__ void get_first_k_initial_global(T* data_cost_selected_, T *selected_disp_pyr, int h, int w, int nr_plane)
-        {
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y < h && x < w)
-            {
-                T* selected_disparity = selected_disp_pyr + y * cmsg_step + x;
-                T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
-                T* data_cost = (T*)ctemp + y * cmsg_step + x;
-
-                for(int i = 0; i < nr_plane; i++)
-                {
-                    T minimum = cudev::numeric_limits<T>::max();
-                    int id = 0;
-                    for(int d = 0; d < cndisp; d++)
-                    {
-                        T cur = data_cost[d * cdisp_step1];
-                        if(cur < minimum)
-                        {
-                            minimum = cur;
-                            id = d;
-                        }
-                    }
-
-                    data_cost_selected[i  * cdisp_step1] = minimum;
-                    selected_disparity[i  * cdisp_step1] = id;
-                    data_cost         [id * cdisp_step1] = numeric_limits<T>::max();
-                }
-            }
-        }
-
-
-        template <typename T>
-        __global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)
-        {
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y < h && x < w)
-            {
-                T* selected_disparity = selected_disp_pyr + y * cmsg_step + x;
-                T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
-                T* data_cost = (T*)ctemp + y * cmsg_step + x;
-
-                int nr_local_minimum = 0;
-
-                T prev = data_cost[0 * cdisp_step1];
-                T cur  = data_cost[1 * cdisp_step1];
-                T next = data_cost[2 * cdisp_step1];
-
-                for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)
-                {
-                    if (cur < prev && cur < next)
-                    {
-                        data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
-                        selected_disparity[nr_local_minimum * cdisp_step1] = d;
-
-                        data_cost[d * cdisp_step1] = numeric_limits<T>::max();
-
-                        nr_local_minimum++;
-                    }
-                    prev = cur;
-                    cur = next;
-                    next = data_cost[(d + 1) * cdisp_step1];
-                }
-
-                for (int i = nr_local_minimum; i < nr_plane; i++)
-                {
-                    T minimum = numeric_limits<T>::max();
-                    int id = 0;
-
-                    for (int d = 0; d < cndisp; d++)
-                    {
-                        cur = data_cost[d * cdisp_step1];
-                        if (cur < minimum)
-                        {
-                            minimum = cur;
-                            id = d;
-                        }
-                    }
-                    data_cost_selected[i * cdisp_step1] = minimum;
-                    selected_disparity[i * cdisp_step1] = id;
-
-                    data_cost[id * cdisp_step1] = numeric_limits<T>::max();
-                }
-            }
-        }
-
-        template <typename T, int channels>
-        __global__ void init_data_cost(int h, int w, int level)
-        {
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y < h && x < w)
-            {
-                int y0 = y << level;
-                int yt = (y + 1) << level;
-
-                int x0 = x << level;
-                int xt = (x + 1) << level;
-
-                T* data_cost = (T*)ctemp + y * cmsg_step + x;
-
-                for(int d = 0; d < cndisp; ++d)
-                {
-                    float val = 0.0f;
-                    for(int yi = y0; yi < yt; yi++)
-                    {
-                        for(int xi = x0; xi < xt; xi++)
-                        {
-                            int xr = xi - d;
-                            if(d < cth || xr < 0)
-                                val += cdata_weight * cmax_data_term;
-                            else
-                            {
-                                const uchar* lle = cleft + yi * cimg_step + xi * channels;
-                                const uchar* lri = cright + yi * cimg_step + xr * channels;
-
-                                val += DataCostPerPixel<channels>::compute(lle, lri);
-                            }
-                        }
-                    }
-                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
-                }
-            }
-        }
-
-        template <typename T, int winsz, int channels>
-        __global__ void init_data_cost_reduce(int level, int rows, int cols, int h)
-        {
-            int x_out = blockIdx.x;
-            int y_out = blockIdx.y % h;
-            int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;
-
-            int tid = threadIdx.x;
-
-            if (d < cndisp)
-            {
-                int x0 = x_out << level;
-                int y0 = y_out << level;
-
-                int len = ::min(y0 + winsz, rows) - y0;
-
-                float val = 0.0f;
-                if (x0 + tid < cols)
-                {
-                    if (x0 + tid - d < 0 || d < cth)
-                        val = cdata_weight * cmax_data_term * len;
-                    else
-                    {
-                        const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
-                        const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);
-
-                        for(int y = 0; y < len; ++y)
-                        {
-                            val += DataCostPerPixel<channels>::compute(lle, lri);
-
-                            lle += cimg_step;
-                            lri += cimg_step;
-                        }
-                    }
-                }
-
-                extern __shared__ float smem[];
-
-                reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
-
-                T* data_cost = (T*)ctemp + y_out * cmsg_step + x_out;
-
-                if (tid == 0)
-                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
-            }
-        }
-
-
-        template <typename T>
-        void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(w, threads.x);
-            grid.y = divUp(h, threads.y);
-
-            switch (channels)
-            {
-            case 1: init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break;
-            case 3: init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break;
-            case 4: init_data_cost<T, 4><<<grid, threads, 0, stream>>>(h, w, level); break;
-            default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
-            }
-        }
-
-        template <typename T, int winsz>
-        void init_data_cost_reduce_caller_(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream)
-        {
-            const int threadsNum = 256;
-            const size_t smem_size = threadsNum * sizeof(float);
-
-            dim3 threads(winsz, 1, threadsNum / winsz);
-            dim3 grid(w, h, 1);
-            grid.y *= divUp(ndisp, threads.z);
-
-            switch (channels)
-            {
-            case 1: init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
-            case 3: init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
-            case 4: init_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
-            default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
-            }
-        }
-
-        template<class T>
-        void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
-                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
-        {
-
-            typedef void (*InitDataCostCaller)(int cols, int rows, int w, int h, int level, int ndisp, int channels, cudaStream_t stream);
-
-            static const InitDataCostCaller init_data_cost_callers[] =
-            {
-                init_data_cost_caller_<T>, init_data_cost_caller_<T>, init_data_cost_reduce_caller_<T, 4>,
-                init_data_cost_reduce_caller_<T, 8>, init_data_cost_reduce_caller_<T, 16>, init_data_cost_reduce_caller_<T, 32>,
-                init_data_cost_reduce_caller_<T, 64>, init_data_cost_reduce_caller_<T, 128>, init_data_cost_reduce_caller_<T, 256>
-            };
-
-            size_t disp_step = msg_step * h;
-            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,  &msg_step,  sizeof(size_t)) );
-
-            init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(w, threads.x);
-            grid.y = divUp(h, threads.y);
-
-            if (use_local_init_data_cost == true)
-                get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
-            else
-                get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
-                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
-
-        template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,
-                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
-
-        ///////////////////////////////////////////////////////////////
-        ////////////////////// compute data cost //////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        template <typename T, int channels>
-        __global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)
-        {
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y < h && x < w)
-            {
-                int y0 = y << level;
-                int yt = (y + 1) << level;
-
-                int x0 = x << level;
-                int xt = (x + 1) << level;
-
-                const T* selected_disparity = selected_disp_pyr + y/2 * cmsg_step + x/2;
-                T* data_cost = data_cost_ + y * cmsg_step + x;
-
-                for(int d = 0; d < nr_plane; d++)
-                {
-                    float val = 0.0f;
-                    for(int yi = y0; yi < yt; yi++)
-                    {
-                        for(int xi = x0; xi < xt; xi++)
-                        {
-                            int sel_disp = selected_disparity[d * cdisp_step2];
-                            int xr = xi - sel_disp;
-
-                            if (xr < 0 || sel_disp < cth)
-                                val += cdata_weight * cmax_data_term;
-                            else
-                            {
-                                const uchar* left_x = cleft + yi * cimg_step + xi * channels;
-                                const uchar* right_x = cright + yi * cimg_step + xr * channels;
-
-                                val += DataCostPerPixel<channels>::compute(left_x, right_x);
-                            }
-                        }
-                    }
-                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
-                }
-            }
-        }
-
-        template <typename T, int winsz, int channels>
-        __global__ void compute_data_cost_reduce(const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane)
-        {
-            int x_out = blockIdx.x;
-            int y_out = blockIdx.y % h;
-            int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;
-
-            int tid = threadIdx.x;
-
-            const T* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step + x_out/2;
-            T* data_cost = data_cost_ + y_out * cmsg_step + x_out;
-
-            if (d < nr_plane)
-            {
-                int sel_disp = selected_disparity[d * cdisp_step2];
-
-                int x0 = x_out << level;
-                int y0 = y_out << level;
-
-                int len = ::min(y0 + winsz, rows) - y0;
-
-                float val = 0.0f;
-                if (x0 + tid < cols)
-                {
-                    if (x0 + tid - sel_disp < 0 || sel_disp < cth)
-                        val = cdata_weight * cmax_data_term * len;
-                    else
-                    {
-                        const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
-                        const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);
-
-                        for(int y = 0; y < len; ++y)
-                        {
-                            val += DataCostPerPixel<channels>::compute(lle, lri);
-
-                            lle += cimg_step;
-                            lri += cimg_step;
-                        }
-                    }
-                }
-
-                extern __shared__ float smem[];
-
-                reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
-
-                if (tid == 0)
-                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
-            }
-        }
-
-        template <typename T>
-        void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,
-                                      int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(w, threads.x);
-            grid.y = divUp(h, threads.y);
-
-            switch(channels)
-            {
-            case 1: compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
-            case 3: compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
-            case 4: compute_data_cost<T, 4><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
-            default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
-            }
-        }
-
-        template <typename T, int winsz>
-        void compute_data_cost_reduce_caller_(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
-                                      int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
-        {
-            const int threadsNum = 256;
-            const size_t smem_size = threadsNum * sizeof(float);
-
-            dim3 threads(winsz, 1, threadsNum / winsz);
-            dim3 grid(w, h, 1);
-            grid.y *= divUp(nr_plane, threads.z);
-
-            switch (channels)
-            {
-            case 1: compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
-            case 3: compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
-            case 4: compute_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
-            default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
-            }
-        }
-
-        template<class T>
-        void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step,
-                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)
-        {
-            typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
-                int h, int w, int level, int nr_plane, int channels, cudaStream_t stream);
-
-            static const ComputeDataCostCaller callers[] =
-            {
-                compute_data_cost_caller_<T>, compute_data_cost_caller_<T>, compute_data_cost_reduce_caller_<T, 4>,
-                compute_data_cost_reduce_caller_<T, 8>, compute_data_cost_reduce_caller_<T, 16>, compute_data_cost_reduce_caller_<T, 32>,
-                compute_data_cost_reduce_caller_<T, 64>, compute_data_cost_reduce_caller_<T, 128>, compute_data_cost_reduce_caller_<T, 256>
-            };
-
-            size_t disp_step1 = msg_step * h;
-            size_t disp_step2 = msg_step * h2;
-            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,  &msg_step,  sizeof(size_t)) );
-
-            callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step,
-                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
-
-        template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step,
-                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
-
-
-        ///////////////////////////////////////////////////////////////
-        //////////////////////// init message /////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-
-         template <typename T>
-        __device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,
-                                                     const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
-                                                     T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,
-                                                     const T* data_cost_cur, const T* disparity_selected_cur,
-                                                     int nr_plane, int nr_plane2)
-        {
-            for(int i = 0; i < nr_plane; i++)
-            {
-                T minimum = numeric_limits<T>::max();
-                int id = 0;
-                for(int j = 0; j < nr_plane2; j++)
-                {
-                    T cur = data_cost_new[j * cdisp_step1];
-                    if(cur < minimum)
-                    {
-                        minimum = cur;
-                        id = j;
-                    }
-                }
-
-                data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1];
-                disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];
-
-                u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];
-                d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];
-                l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
-                r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];
-
-                data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();
-            }
-        }
-
-        template <typename T>
-        __global__ void init_message(T* u_new_, T* d_new_, T* l_new_, T* r_new_,
-                                     const T* u_cur_, const T* d_cur_, const T* l_cur_, const T* r_cur_,
-                                     T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
-                                     T* data_cost_selected_, const T* data_cost_,
-                                     int h, int w, int nr_plane, int h2, int w2, int nr_plane2)
-        {
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y < h && x < w)
-            {
-                const T* u_cur = u_cur_ + ::min(h2-1, y/2 + 1) * cmsg_step + x/2;
-                const T* d_cur = d_cur_ + ::max(0, y/2 - 1)    * cmsg_step + x/2;
-                const T* l_cur = l_cur_ + (y/2)                * cmsg_step + ::min(w2-1, x/2 + 1);
-                const T* r_cur = r_cur_ + (y/2)                * cmsg_step + ::max(0, x/2 - 1);
-
-                T* data_cost_new = (T*)ctemp + y * cmsg_step + x;
-
-                const T* disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step + x/2;
-                const T* data_cost = data_cost_ + y * cmsg_step + x;
-
-                for(int d = 0; d < nr_plane2; d++)
-                {
-                    int idx2 = d * cdisp_step2;
-
-                    T val  = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];
-                    data_cost_new[d * cdisp_step1] = val;
-                }
-
-                T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
-                T* disparity_selected_new = selected_disp_pyr_new + y * cmsg_step + x;
-
-                T* u_new = u_new_ + y * cmsg_step + x;
-                T* d_new = d_new_ + y * cmsg_step + x;
-                T* l_new = l_new_ + y * cmsg_step + x;
-                T* r_new = r_new_ + y * cmsg_step + x;
-
-                u_cur = u_cur_ + y/2 * cmsg_step + x/2;
-                d_cur = d_cur_ + y/2 * cmsg_step + x/2;
-                l_cur = l_cur_ + y/2 * cmsg_step + x/2;
-                r_cur = r_cur_ + y/2 * cmsg_step + x/2;
-
-                get_first_k_element_increase(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,
-                                             data_cost_selected, disparity_selected_new, data_cost_new,
-                                             data_cost, disparity_selected_cur, nr_plane, nr_plane2);
-            }
-        }
-
-
-        template<class T>
-        void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
-                          const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
-                          T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
-                          T* data_cost_selected, const T* data_cost, size_t msg_step,
-                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream)
-        {
-
-            size_t disp_step1 = msg_step * h;
-            size_t disp_step2 = msg_step * h2;
-            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,   &msg_step, sizeof(size_t)) );
-
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(w, threads.x);
-            grid.y = divUp(h, threads.y);
-
-            init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,
-                                                       u_cur, d_cur, l_cur, r_cur,
-                                                       selected_disp_pyr_new, selected_disp_pyr_cur,
-                                                       data_cost_selected, data_cost,
-                                                       h, w, nr_plane, h2, w2, nr_plane2);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-        template void init_message(short* u_new, short* d_new, short* l_new, short* r_new,
-                          const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,
-                          short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,
-                          short* data_cost_selected, const short* data_cost, size_t msg_step,
-                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
-
-        template void init_message(float* u_new, float* d_new, float* l_new, float* r_new,
-                          const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,
-                          float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,
-                          float* data_cost_selected, const float* data_cost, size_t msg_step,
-                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
-
-        ///////////////////////////////////////////////////////////////
-        ////////////////////  calc all iterations /////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        template <typename T>
-        __device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
-                                          const T* dst_disp, const T* src_disp, int nr_plane, volatile T* temp)
-        {
-            T minimum = numeric_limits<T>::max();
-
-            for(int d = 0; d < nr_plane; d++)
-            {
-                int idx = d * cdisp_step1;
-                T val  = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];
-
-                if(val < minimum)
-                    minimum = val;
-
-                msg_dst[idx] = val;
-            }
-
-            float sum = 0;
-            for(int d = 0; d < nr_plane; d++)
-            {
-                float cost_min = minimum + cmax_disc_term;
-                T src_disp_reg = src_disp[d * cdisp_step1];
-
-                for(int d2 = 0; d2 < nr_plane; d2++)
-                    cost_min = fmin(cost_min, msg_dst[d2 * cdisp_step1] + cdisc_single_jump * ::abs(dst_disp[d2 * cdisp_step1] - src_disp_reg));
-
-                temp[d * cdisp_step1] = saturate_cast<T>(cost_min);
-                sum += cost_min;
-            }
-            sum /= nr_plane;
-
-            for(int d = 0; d < nr_plane; d++)
-                msg_dst[d * cdisp_step1] = saturate_cast<T>(temp[d * cdisp_step1] - sum);
-        }
-
-        template <typename T>
-        __global__ void compute_message(T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i)
-        {
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-            int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + i) & 1);
-
-            if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
-            {
-                const T* data = data_cost_selected + y * cmsg_step + x;
-
-                T* u = u_ + y * cmsg_step + x;
-                T* d = d_ + y * cmsg_step + x;
-                T* l = l_ + y * cmsg_step + x;
-                T* r = r_ + y * cmsg_step + x;
-
-                const T* disp = selected_disp_pyr_cur + y * cmsg_step + x;
-
-                T* temp = (T*)ctemp + y * cmsg_step + x;
-
-                message_per_pixel(data, u, r - 1, u + cmsg_step, l + 1, disp, disp - cmsg_step, nr_plane, temp);
-                message_per_pixel(data, d, d - cmsg_step, r - 1, l + 1, disp, disp + cmsg_step, nr_plane, temp);
-                message_per_pixel(data, l, u + cmsg_step, d - cmsg_step, l + 1, disp, disp - 1, nr_plane, temp);
-                message_per_pixel(data, r, u + cmsg_step, d - cmsg_step, r - 1, disp, disp + 1, nr_plane, temp);
-            }
-        }
-
-
-        template<class T>
-        void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
-            const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)
-        {
-            size_t disp_step = msg_step * h;
-            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,  &msg_step,  sizeof(size_t)) );
-
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(w, threads.x << 1);
-            grid.y = divUp(h, threads.y);
-
-            for(int t = 0; t < iters; ++t)
-            {
-                compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
-                cudaSafeCall( cudaGetLastError() );
-            }
-            if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() );
-        };
-
-        template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,
-            int h, int w, int nr_plane, int iters, cudaStream_t stream);
-
-        template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step,
-            int h, int w, int nr_plane, int iters, cudaStream_t stream);
-
-
-        ///////////////////////////////////////////////////////////////
-        /////////////////////////// output ////////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-
-        template <typename T>
-        __global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,
-                                     const T* data_cost_selected, const T* disp_selected_pyr,
-                                     PtrStepSz<short> disp, int nr_plane)
-        {
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
-            {
-                const T* data = data_cost_selected + y * cmsg_step + x;
-                const T* disp_selected = disp_selected_pyr + y * cmsg_step + x;
-
-                const T* u = u_ + (y+1) * cmsg_step + (x+0);
-                const T* d = d_ + (y-1) * cmsg_step + (x+0);
-                const T* l = l_ + (y+0) * cmsg_step + (x+1);
-                const T* r = r_ + (y+0) * cmsg_step + (x-1);
-
-                int best = 0;
-                T best_val = numeric_limits<T>::max();
-                for (int i = 0; i < nr_plane; ++i)
-                {
-                    int idx = i * cdisp_step1;
-                    T val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];
-
-                    if (val < best_val)
-                    {
-                        best_val = val;
-                        best = saturate_cast<short>(disp_selected[idx]);
-                    }
-                }
-                disp(y, x) = best;
-            }
-        }
-
-        template<class T>
-        void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
-            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream)
-        {
-            size_t disp_step = disp.rows * msg_step;
-            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,  &msg_step,  sizeof(size_t)) );
-
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(disp.cols, threads.x);
-            grid.y = divUp(disp.rows, threads.y);
-
-            compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected, disp, nr_plane);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,
-            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
-
-        template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,
-            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
-    } // namespace stereocsbp
-}}} // namespace cv { namespace gpu { namespace cudev {
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/disparity_bilateral_filter.cpp
+++ b/modules/gpu/src/disparity_bilateral_filter.cpp
@@ -1,157 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int) { throw_no_cuda(); }
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int, float, float, float) { throw_no_cuda(); }
-
-void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace disp_bilateral_filter
-    {
-        void disp_load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc);
-
-        template<typename T>
-        void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
-    }
-}}}
-
-using namespace ::cv::gpu::cudev::disp_bilateral_filter;
-
-namespace
-{
-    const float DEFAULT_EDGE_THRESHOLD = 0.1f;
-    const float DEFAULT_MAX_DISC_THRESHOLD = 0.2f;
-    const float DEFAULT_SIGMA_RANGE = 10.0f;
-
-    inline void calc_color_weighted_table(GpuMat& table_color, float sigma_range, int len)
-    {
-        Mat cpu_table_color(1, len, CV_32F);
-
-        float* line = cpu_table_color.ptr<float>();
-
-        for(int i = 0; i < len; i++)
-            line[i] = static_cast<float>(std::exp(-double(i * i) / (2 * sigma_range * sigma_range)));
-
-        table_color.upload(cpu_table_color);
-    }
-
-    inline void calc_space_weighted_filter(GpuMat& table_space, int win_size, float dist_space)
-    {
-        int half = (win_size >> 1);
-
-        Mat cpu_table_space(half + 1, half + 1, CV_32F);
-
-        for (int y = 0; y <= half; ++y)
-        {
-            float* row = cpu_table_space.ptr<float>(y);
-            for (int x = 0; x <= half; ++x)
-                row[x] = exp(-sqrt(float(y * y) + float(x * x)) / dist_space);
-        }
-
-        table_space.upload(cpu_table_space);
-    }
-
-    template <typename T>
-    void disp_bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold,float max_disc_threshold,
-                                   GpuMat& table_color, GpuMat& table_space,
-                                   const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
-    {
-        short edge_disc = std::max<short>(short(1), short(ndisp * edge_threshold + 0.5));
-        short max_disc = short(ndisp * max_disc_threshold + 0.5);
-
-        disp_load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
-
-        if (&dst != &disp)
-        {
-            if (stream)
-                stream.enqueueCopy(disp, dst);
-            else
-                disp.copyTo(dst);
-        }
-
-        disp_bilateral_filter<T>(dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
-    }
-
-    typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
-                                                GpuMat& table_color, GpuMat& table_space,
-                                                const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream);
-
-    const bilateral_filter_operator_t operators[] =
-        {disp_bilateral_filter_operator<unsigned char>, 0, 0, disp_bilateral_filter_operator<short>, 0, 0, 0, 0};
-}
-
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_)
-    : ndisp(ndisp_), radius(radius_), iters(iters_), edge_threshold(DEFAULT_EDGE_THRESHOLD), max_disc_threshold(DEFAULT_MAX_DISC_THRESHOLD),
-      sigma_range(DEFAULT_SIGMA_RANGE)
-{
-    calc_color_weighted_table(table_color, sigma_range, 255);
-    calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
-}
-
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_, float edge_threshold_,
-                                                     float max_disc_threshold_, float sigma_range_)
-    : ndisp(ndisp_), radius(radius_), iters(iters_), edge_threshold(edge_threshold_), max_disc_threshold(max_disc_threshold_),
-      sigma_range(sigma_range_)
-{
-    calc_color_weighted_table(table_color, sigma_range, 255);
-    calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
-}
-
-void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
-{
-    CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters);
-    CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3));
-    operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, stream);
-}
-
-#endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/src/stereobm.cpp
+++ b/modules/gpu/src/stereobm.cpp
@@ -1,140 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-cv::gpu::StereoBM_GPU::StereoBM_GPU() { throw_no_cuda(); }
-cv::gpu::StereoBM_GPU::StereoBM_GPU(int, int, int) { throw_no_cuda(); }
-
-bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable() { throw_no_cuda(); return false; }
-void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace stereobm
-    {
-        void stereoBM_GPU(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t & stream);
-        void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap /*= 31*/, cudaStream_t & stream);
-        void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream);
-    }
-}}}
-
-namespace
-{
-    const float defaultAvgTexThreshold = 3;
-}
-
-cv::gpu::StereoBM_GPU::StereoBM_GPU()
-    : preset(BASIC_PRESET), ndisp(DEFAULT_NDISP), winSize(DEFAULT_WINSZ), avergeTexThreshold(defaultAvgTexThreshold)
-{
-}
-
-cv::gpu::StereoBM_GPU::StereoBM_GPU(int preset_, int ndisparities_, int winSize_)
-    : preset(preset_), ndisp(ndisparities_), winSize(winSize_), avergeTexThreshold(defaultAvgTexThreshold)
-{
-    const int max_supported_ndisp = 1 << (sizeof(unsigned char) * 8);
-    CV_Assert(0 < ndisp && ndisp <= max_supported_ndisp);
-    CV_Assert(ndisp % 8 == 0);
-    CV_Assert(winSize % 2 == 1);
-}
-
-bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
-{
-    if (0 == getCudaEnabledDeviceCount())
-        return false;
-
-    DeviceInfo device_info;
-
-    if (device_info.majorVersion() > 1 || device_info.multiProcessorCount() > 16)
-        return true;
-
-    return false;
-}
-
-namespace
-{
-    void stereo_bm_gpu_operator( GpuMat& minSSD,  GpuMat& leBuf, GpuMat&  riBuf,  int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, cudaStream_t stream)
-    {
-        using namespace ::cv::gpu::cudev::stereobm;
-
-        CV_Assert(left.rows == right.rows && left.cols == right.cols);
-        CV_Assert(left.type() == CV_8UC1);
-        CV_Assert(right.type() == CV_8UC1);
-
-        disparity.create(left.size(), CV_8U);
-        minSSD.create(left.size(), CV_32S);
-
-        GpuMat le_for_bm =  left;
-        GpuMat ri_for_bm = right;
-
-        if (preset == StereoBM_GPU::PREFILTER_XSOBEL)
-        {
-            leBuf.create( left.size(),  left.type());
-            riBuf.create(right.size(), right.type());
-
-            prefilter_xsobel( left, leBuf, 31, stream);
-            prefilter_xsobel(right, riBuf, 31, stream);
-
-            le_for_bm = leBuf;
-            ri_for_bm = riBuf;
-        }
-
-        stereoBM_GPU(le_for_bm, ri_for_bm, disparity, ndisp, winSize, minSSD, stream);
-
-        if (avergeTexThreshold)
-            postfilter_textureness(le_for_bm, winSize, avergeTexThreshold, disparity, stream);
-    }
-}
-
-void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream)
-{
-    stereo_bm_gpu_operator(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity, StreamAccessor::getStream(stream));
-}
-
-#endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/src/stereobp.cpp
+++ b/modules/gpu/src/stereobp.cpp
@@ -1,368 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::StereoBeliefPropagation::estimateRecommendedParams(int, int, int&, int&, int&) { throw_no_cuda(); }
-
-cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int, int, int, int) { throw_no_cuda(); }
-cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int, int, int, float, float, float, float, int) { throw_no_cuda(); }
-
-void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace stereobp
-    {
-        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump);
-        template<typename T, typename D>
-        void comp_data_gpu(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
-        template<typename T>
-        void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
-        template <typename T>
-        void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
-        template <typename T>
-        void calc_all_iterations_gpu(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d,
-            const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
-        template <typename T>
-        void output_gpu(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data,
-            const PtrStepSz<short>& disp, cudaStream_t stream);
-    }
-}}}
-
-using namespace ::cv::gpu::cudev::stereobp;
-
-namespace
-{
-    const float DEFAULT_MAX_DATA_TERM = 10.0f;
-    const float DEFAULT_DATA_WEIGHT = 0.07f;
-    const float DEFAULT_MAX_DISC_TERM = 1.7f;
-    const float DEFAULT_DISC_SINGLE_JUMP = 1.0f;
-}
-
-void cv::gpu::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)
-{
-    ndisp = width / 4;
-    if ((ndisp & 1) != 0)
-        ndisp++;
-
-    int mm = std::max(width, height);
-    iters = mm / 100 + 2;
-
-    levels = (int)(::log(static_cast<double>(mm)) + 1) * 4 / 5;
-    if (levels == 0) levels++;
-}
-
-cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp_, int iters_, int levels_, int msg_type_)
-    : ndisp(ndisp_), iters(iters_), levels(levels_),
-      max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
-      max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP),
-      msg_type(msg_type_), datas(levels_)
-{
-}
-
-cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp_, int iters_, int levels_, float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_, int msg_type_)
-    : ndisp(ndisp_), iters(iters_), levels(levels_),
-      max_data_term(max_data_term_), data_weight(data_weight_),
-      max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_),
-      msg_type(msg_type_), datas(levels_)
-{
-}
-
-namespace
-{
-    class StereoBeliefPropagationImpl
-    {
-    public:
-        StereoBeliefPropagationImpl(StereoBeliefPropagation& rthis_,
-                                    GpuMat& u_, GpuMat& d_, GpuMat& l_, GpuMat& r_,
-                                    GpuMat& u2_, GpuMat& d2_, GpuMat& l2_, GpuMat& r2_,
-                                    std::vector<GpuMat>& datas_, GpuMat& out_)
-            : rthis(rthis_), u(u_), d(d_), l(l_), r(r_), u2(u2_), d2(d2_), l2(l2_), r2(r2_), datas(datas_), out(out_),
-              zero(Scalar::all(0)), scale(rthis_.msg_type == CV_32F ? 1.0f : 10.0f)
-        {
-            CV_Assert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels);
-            CV_Assert(rthis.msg_type == CV_32F || rthis.msg_type == CV_16S);
-            CV_Assert(rthis.msg_type == CV_32F || (1 << (rthis.levels - 1)) * scale * rthis.max_data_term < std::numeric_limits<short>::max());
-        }
-
-        void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
-        {
-            typedef void (*comp_data_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
-            static const comp_data_t comp_data_callers[2][5] =
-            {
-                {0, comp_data_gpu<unsigned char, short>, 0, comp_data_gpu<uchar3, short>, comp_data_gpu<uchar4, short>},
-                {0, comp_data_gpu<unsigned char, float>, 0, comp_data_gpu<uchar3, float>, comp_data_gpu<uchar4, float>}
-            };
-
-            CV_Assert(left.size() == right.size() && left.type() == right.type());
-            CV_Assert(left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4);
-
-            rows = left.rows;
-            cols = left.cols;
-
-            int divisor = (int)pow(2.f, rthis.levels - 1.0f);
-            int lowest_cols = cols / divisor;
-            int lowest_rows = rows / divisor;
-            const int min_image_dim_size = 2;
-            CV_Assert(std::min(lowest_cols, lowest_rows) > min_image_dim_size);
-
-            init(stream);
-
-            datas[0].create(rows * rthis.ndisp, cols, rthis.msg_type);
-
-            comp_data_callers[rthis.msg_type == CV_32F][left.channels()](left, right, datas[0], StreamAccessor::getStream(stream));
-
-            calcBP(disp, stream);
-        }
-
-        void operator()(const GpuMat& data, GpuMat& disp, Stream& stream)
-        {
-            CV_Assert((data.type() == rthis.msg_type) && (data.rows % rthis.ndisp == 0));
-
-            rows = data.rows / rthis.ndisp;
-            cols = data.cols;
-
-            int divisor = (int)pow(2.f, rthis.levels - 1.0f);
-            int lowest_cols = cols / divisor;
-            int lowest_rows = rows / divisor;
-            const int min_image_dim_size = 2;
-            CV_Assert(std::min(lowest_cols, lowest_rows) > min_image_dim_size);
-
-            init(stream);
-
-            datas[0] = data;
-
-            calcBP(disp, stream);
-        }
-    private:
-        void init(Stream& stream)
-        {
-            u.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            d.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            l.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            r.create(rows * rthis.ndisp, cols, rthis.msg_type);
-
-            if (rthis.levels & 1)
-            {
-                //can clear less area
-                if (stream)
-                {
-                    stream.enqueueMemSet(u, zero);
-                    stream.enqueueMemSet(d, zero);
-                    stream.enqueueMemSet(l, zero);
-                    stream.enqueueMemSet(r, zero);
-                }
-                else
-                {
-                    u.setTo(zero);
-                    d.setTo(zero);
-                    l.setTo(zero);
-                    r.setTo(zero);
-                }
-            }
-
-            if (rthis.levels > 1)
-            {
-                int less_rows = (rows + 1) / 2;
-                int less_cols = (cols + 1) / 2;
-
-                u2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                d2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                l2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                r2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-
-                if ((rthis.levels & 1) == 0)
-                {
-                    if (stream)
-                    {
-                        stream.enqueueMemSet(u2, zero);
-                        stream.enqueueMemSet(d2, zero);
-                        stream.enqueueMemSet(l2, zero);
-                        stream.enqueueMemSet(r2, zero);
-                    }
-                    else
-                    {
-                        u2.setTo(zero);
-                        d2.setTo(zero);
-                        l2.setTo(zero);
-                        r2.setTo(zero);
-                    }
-                }
-            }
-
-            load_constants(rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight, scale * rthis.max_disc_term, scale * rthis.disc_single_jump);
-
-            datas.resize(rthis.levels);
-
-            cols_all.resize(rthis.levels);
-            rows_all.resize(rthis.levels);
-
-            cols_all[0] = cols;
-            rows_all[0] = rows;
-        }
-
-        void calcBP(GpuMat& disp, Stream& stream)
-        {
-            typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
-            static const data_step_down_t data_step_down_callers[2] =
-            {
-                data_step_down_gpu<short>, data_step_down_gpu<float>
-            };
-
-            typedef void (*level_up_messages_t)(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
-            static const level_up_messages_t level_up_messages_callers[2] =
-            {
-                level_up_messages_gpu<short>, level_up_messages_gpu<float>
-            };
-
-            typedef void (*calc_all_iterations_t)(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
-            static const calc_all_iterations_t calc_all_iterations_callers[2] =
-            {
-                calc_all_iterations_gpu<short>, calc_all_iterations_gpu<float>
-            };
-
-            typedef void (*output_t)(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
-            static const output_t output_callers[2] =
-            {
-                output_gpu<short>, output_gpu<float>
-            };
-
-            const int funcIdx = rthis.msg_type == CV_32F;
-
-            cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-
-            for (int i = 1; i < rthis.levels; ++i)
-            {
-                cols_all[i] = (cols_all[i-1] + 1) / 2;
-                rows_all[i] = (rows_all[i-1] + 1) / 2;
-
-                datas[i].create(rows_all[i] * rthis.ndisp, cols_all[i], rthis.msg_type);
-
-                data_step_down_callers[funcIdx](cols_all[i], rows_all[i], rows_all[i-1], datas[i-1], datas[i], cudaStream);
-            }
-
-            PtrStepSzb mus[] = {u, u2};
-            PtrStepSzb mds[] = {d, d2};
-            PtrStepSzb mrs[] = {r, r2};
-            PtrStepSzb mls[] = {l, l2};
-
-            int mem_idx = (rthis.levels & 1) ? 0 : 1;
-
-            for (int i = rthis.levels - 1; i >= 0; --i)
-            {
-                // for lower level we have already computed messages by setting to zero
-                if (i != rthis.levels - 1)
-                    level_up_messages_callers[funcIdx](mem_idx, cols_all[i], rows_all[i], rows_all[i+1], mus, mds, mls, mrs, cudaStream);
-
-                calc_all_iterations_callers[funcIdx](cols_all[i], rows_all[i], rthis.iters, mus[mem_idx], mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas[i], cudaStream);
-
-                mem_idx = (mem_idx + 1) & 1;
-            }
-
-            if (disp.empty())
-                disp.create(rows, cols, CV_16S);
-
-            out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
-
-            if (stream)
-                stream.enqueueMemSet(out, zero);
-            else
-                out.setTo(zero);
-
-            output_callers[funcIdx](u, d, l, r, datas.front(), out, cudaStream);
-
-            if (disp.type() != CV_16S)
-            {
-                if (stream)
-                    stream.enqueueConvert(out, disp, disp.type());
-                else
-                    out.convertTo(disp, disp.type());
-            }
-        }
-
-        StereoBeliefPropagation& rthis;
-
-        GpuMat& u;
-        GpuMat& d;
-        GpuMat& l;
-        GpuMat& r;
-
-        GpuMat& u2;
-        GpuMat& d2;
-        GpuMat& l2;
-        GpuMat& r2;
-
-        std::vector<GpuMat>& datas;
-        GpuMat& out;
-
-        const Scalar zero;
-        const float scale;
-
-        int rows, cols;
-
-        std::vector<int> cols_all, rows_all;
-    };
-}
-
-void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
-{
-    StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
-    impl(left, right, disp, stream);
-}
-
-void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& data, GpuMat& disp, Stream& stream)
-{
-    StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
-    impl(data, disp, stream);
-}
-
-#endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/src/stereocsbp.cpp
+++ b/modules/gpu/src/stereocsbp.cpp
@@ -1,311 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::StereoConstantSpaceBP::estimateRecommendedParams(int, int, int&, int&, int&, int&) { throw_no_cuda(); }
-
-cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, int) { throw_no_cuda(); }
-cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, float, float, float, float, int, int) { throw_no_cuda(); }
-
-void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-#include "opencv2/core/utility.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace stereocsbp
-    {
-        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
-            const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& temp);
-
-        template<class T>
-        void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
-                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
-
-        template<class T>
-        void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step,
-                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
-
-        template<class T>
-        void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
-                          const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
-                          T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
-                          T* data_cost_selected, const T* data_cost, size_t msg_step,
-                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
-
-        template<class T>
-        void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
-            const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream);
-
-        template<class T>
-        void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
-            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
-    }
-}}}
-
-using namespace ::cv::gpu::cudev::stereocsbp;
-
-namespace
-{
-    const float DEFAULT_MAX_DATA_TERM = 30.0f;
-    const float DEFAULT_DATA_WEIGHT = 1.0f;
-    const float DEFAULT_MAX_DISC_TERM = 160.0f;
-    const float DEFAULT_DISC_SINGLE_JUMP = 10.0f;
-}
-
-void cv::gpu::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane)
-{
-    ndisp = (int) ((float) width / 3.14f);
-    if ((ndisp & 1) != 0)
-        ndisp++;
-
-    int mm = std::max(width, height);
-    iters = mm / 100 + ((mm > 1200)? - 4 : 4);
-
-    levels = (int)::log(static_cast<double>(mm)) * 2 / 3;
-    if (levels == 0) levels++;
-
-    nr_plane = (int) ((float) ndisp / std::pow(2.0, levels + 1));
-}
-
-cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
-                                                      int msg_type_)
-
-    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
-      max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
-      max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP), min_disp_th(0),
-      msg_type(msg_type_), use_local_init_data_cost(true)
-{
-    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
-}
-
-cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
-                                                      float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_,
-                                                      int min_disp_th_, int msg_type_)
-    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
-      max_data_term(max_data_term_), data_weight(data_weight_),
-      max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_), min_disp_th(min_disp_th_),
-      msg_type(msg_type_), use_local_init_data_cost(true)
-{
-    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
-}
-
-template<class T>
-static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat& mbuf, GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
-{
-    CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
-        && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());
-
-    CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4));
-
-    const Scalar zero = Scalar::all(0);
-
-    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-
-    ////////////////////////////////////////////////////////////////////////////////////////////
-    // Init
-
-    int rows = left.rows;
-    int cols = left.cols;
-
-    rthis.levels = std::min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));
-    int levels = rthis.levels;
-
-    // compute sizes
-    AutoBuffer<int> buf(levels * 3);
-    int* cols_pyr = buf;
-    int* rows_pyr = cols_pyr + levels;
-    int* nr_plane_pyr = rows_pyr + levels;
-
-    cols_pyr[0]     = cols;
-    rows_pyr[0]     = rows;
-    nr_plane_pyr[0] = rthis.nr_plane;
-
-    for (int i = 1; i < levels; i++)
-    {
-        cols_pyr[i]     = cols_pyr[i-1] / 2;
-        rows_pyr[i]     = rows_pyr[i-1] / 2;
-        nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2;
-    }
-
-
-    GpuMat u[2], d[2], l[2], r[2], disp_selected_pyr[2], data_cost, data_cost_selected;
-
-
-    //allocate buffers
-    int buffers_count = 10; // (up + down + left + right + disp_selected_pyr) * 2
-    buffers_count += 2; //  data_cost has twice more rows than other buffers, what's why +2, not +1;
-    buffers_count += 1; //  data_cost_selected
-    mbuf.create(rows * rthis.nr_plane * buffers_count, cols, DataType<T>::type);
-
-    data_cost          = mbuf.rowRange(0, rows * rthis.nr_plane * 2);
-    data_cost_selected = mbuf.rowRange(data_cost.rows, data_cost.rows + rows * rthis.nr_plane);
-
-    for(int k = 0; k < 2; ++k) // in/out
-    {
-        GpuMat sub1 = mbuf.rowRange(data_cost.rows + data_cost_selected.rows, mbuf.rows);
-        GpuMat sub2 = sub1.rowRange((k+0)*sub1.rows/2, (k+1)*sub1.rows/2);
-
-        GpuMat *buf_ptrs[] = { &u[k], &d[k], &l[k], &r[k], &disp_selected_pyr[k] };
-        for(int _r = 0; _r < 5; ++_r)
-        {
-            *buf_ptrs[_r] = sub2.rowRange(_r * sub2.rows/5, (_r+1) * sub2.rows/5);
-            assert(buf_ptrs[_r]->cols == cols && buf_ptrs[_r]->rows == rows * rthis.nr_plane);
-        }
-    };
-
-    size_t elem_step = mbuf.step / sizeof(T);
-
-    Size temp_size = data_cost.size();
-    if ((size_t)temp_size.area() < elem_step * rows_pyr[levels - 1] * rthis.ndisp)
-        temp_size = Size(static_cast<int>(elem_step), rows_pyr[levels - 1] * rthis.ndisp);
-
-    temp.create(temp_size, DataType<T>::type);
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Compute
-
-    load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);
-
-    if (stream)
-    {
-        stream.enqueueMemSet(l[0], zero);
-        stream.enqueueMemSet(d[0], zero);
-        stream.enqueueMemSet(r[0], zero);
-        stream.enqueueMemSet(u[0], zero);
-
-        stream.enqueueMemSet(l[1], zero);
-        stream.enqueueMemSet(d[1], zero);
-        stream.enqueueMemSet(r[1], zero);
-        stream.enqueueMemSet(u[1], zero);
-
-        stream.enqueueMemSet(data_cost, zero);
-        stream.enqueueMemSet(data_cost_selected, zero);
-    }
-    else
-    {
-        l[0].setTo(zero);
-        d[0].setTo(zero);
-        r[0].setTo(zero);
-        u[0].setTo(zero);
-
-        l[1].setTo(zero);
-        d[1].setTo(zero);
-        r[1].setTo(zero);
-        u[1].setTo(zero);
-
-        data_cost.setTo(zero);
-        data_cost_selected.setTo(zero);
-    }
-
-    int cur_idx = 0;
-
-    for (int i = levels - 1; i >= 0; i--)
-    {
-        if (i == levels - 1)
-        {
-            init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
-                elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream);
-        }
-        else
-        {
-            compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), elem_step,
-                left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream);
-
-            int new_idx = (cur_idx + 1) & 1;
-
-            init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),
-                         u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
-                         disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),
-                         data_cost_selected.ptr<T>(), data_cost.ptr<T>(), elem_step, rows_pyr[i],
-                         cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream);
-
-            cur_idx = new_idx;
-        }
-
-        calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
-                            data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step,
-                            rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream);
-    }
-
-    if (disp.empty())
-        disp.create(rows, cols, CV_16S);
-
-    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
-
-    if (stream)
-        stream.enqueueMemSet(out, zero);
-    else
-        out.setTo(zero);
-
-    compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
-                 data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step, out, nr_plane_pyr[0], cudaStream);
-
-    if (disp.type() != CV_16S)
-    {
-        if (stream)
-            stream.enqueueConvert(out, disp, disp.type());
-        else
-            out.convertTo(disp, disp.type());
-    }
-}
-
-
-typedef void (*csbp_operator_t)(StereoConstantSpaceBP& rthis, GpuMat& mbuf,
-                                     GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream);
-
-const static csbp_operator_t operators[] = {0, 0, 0, csbp_operator<short>, 0, csbp_operator<float>, 0, 0};
-
-void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
-{
-    CV_Assert(msg_type == CV_32F || msg_type == CV_16S);
-    operators[msg_type](*this, messages_buffers, temp, out, left, right, disp, stream);
-}
-
-#endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/test/test_calib3d.cpp
+++ b/modules/gpu/test/test_calib3d.cpp
@@ -1,348 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-using namespace cvtest;
-
-//////////////////////////////////////////////////////////////////////////
-// StereoBM
-
-struct StereoBM : testing::TestWithParam<cv::gpu::DeviceInfo>
-{
-    cv::gpu::DeviceInfo devInfo;
-
-    virtual void SetUp()
-    {
-        devInfo = GetParam();
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(StereoBM, Regression)
-{
-    cv::Mat left_image  = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
-    cv::Mat right_image = readImage("stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE);
-    cv::Mat disp_gold   = readImage("stereobm/aloe-disp.png", cv::IMREAD_GRAYSCALE);
-
-    ASSERT_FALSE(left_image.empty());
-    ASSERT_FALSE(right_image.empty());
-    ASSERT_FALSE(disp_gold.empty());
-
-    cv::gpu::StereoBM_GPU bm(0, 128, 19);
-    cv::gpu::GpuMat disp;
-
-    bm(loadMat(left_image), loadMat(right_image), disp);
-
-    EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Calib3D, StereoBM, ALL_DEVICES);
-
-//////////////////////////////////////////////////////////////////////////
-// StereoBeliefPropagation
-
-struct StereoBeliefPropagation : testing::TestWithParam<cv::gpu::DeviceInfo>
-{
-    cv::gpu::DeviceInfo devInfo;
-
-    virtual void SetUp()
-    {
-        devInfo = GetParam();
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(StereoBeliefPropagation, Regression)
-{
-    cv::Mat left_image  = readImage("stereobp/aloe-L.png");
-    cv::Mat right_image = readImage("stereobp/aloe-R.png");
-    cv::Mat disp_gold   = readImage("stereobp/aloe-disp.png", cv::IMREAD_GRAYSCALE);
-
-    ASSERT_FALSE(left_image.empty());
-    ASSERT_FALSE(right_image.empty());
-    ASSERT_FALSE(disp_gold.empty());
-
-    cv::gpu::StereoBeliefPropagation bp(64, 8, 2, 25, 0.1f, 15, 1, CV_16S);
-    cv::gpu::GpuMat disp;
-
-    bp(loadMat(left_image), loadMat(right_image), disp);
-
-    cv::Mat h_disp(disp);
-    h_disp.convertTo(h_disp, disp_gold.depth());
-
-    EXPECT_MAT_NEAR(disp_gold, h_disp, 0.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Calib3D, StereoBeliefPropagation, ALL_DEVICES);
-
-//////////////////////////////////////////////////////////////////////////
-// StereoConstantSpaceBP
-
-struct StereoConstantSpaceBP : testing::TestWithParam<cv::gpu::DeviceInfo>
-{
-    cv::gpu::DeviceInfo devInfo;
-
-    virtual void SetUp()
-    {
-        devInfo = GetParam();
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(StereoConstantSpaceBP, Regression)
-{
-    cv::Mat left_image  = readImage("csstereobp/aloe-L.png");
-    cv::Mat right_image = readImage("csstereobp/aloe-R.png");
-
-    cv::Mat disp_gold;
-
-    if (supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20))
-        disp_gold = readImage("csstereobp/aloe-disp.png", cv::IMREAD_GRAYSCALE);
-    else
-        disp_gold = readImage("csstereobp/aloe-disp_CC1X.png", cv::IMREAD_GRAYSCALE);
-
-    ASSERT_FALSE(left_image.empty());
-    ASSERT_FALSE(right_image.empty());
-    ASSERT_FALSE(disp_gold.empty());
-
-    cv::gpu::StereoConstantSpaceBP csbp(128, 16, 4, 4);
-    cv::gpu::GpuMat disp;
-
-    csbp(loadMat(left_image), loadMat(right_image), disp);
-
-    cv::Mat h_disp(disp);
-    h_disp.convertTo(h_disp, disp_gold.depth());
-
-    EXPECT_MAT_NEAR(disp_gold, h_disp, 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Calib3D, StereoConstantSpaceBP, ALL_DEVICES);
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// transformPoints
-
-struct TransformPoints : testing::TestWithParam<cv::gpu::DeviceInfo>
-{
-    cv::gpu::DeviceInfo devInfo;
-
-    virtual void SetUp()
-    {
-        devInfo = GetParam();
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(TransformPoints, Accuracy)
-{
-    cv::Mat src = randomMat(cv::Size(1000, 1), CV_32FC3, 0, 10);
-    cv::Mat rvec = randomMat(cv::Size(3, 1), CV_32F, 0, 1);
-    cv::Mat tvec = randomMat(cv::Size(3, 1), CV_32F, 0, 1);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::transformPoints(loadMat(src), rvec, tvec, dst);
-
-    ASSERT_EQ(src.size(), dst.size());
-    ASSERT_EQ(src.type(), dst.type());
-
-    cv::Mat h_dst(dst);
-
-    cv::Mat rot;
-    cv::Rodrigues(rvec, rot);
-
-    for (int i = 0; i < h_dst.cols; ++i)
-    {
-        cv::Point3f res = h_dst.at<cv::Point3f>(0, i);
-
-        cv::Point3f p = src.at<cv::Point3f>(0, i);
-        cv::Point3f res_gold(
-                rot.at<float>(0, 0) * p.x + rot.at<float>(0, 1) * p.y + rot.at<float>(0, 2) * p.z + tvec.at<float>(0, 0),
-                rot.at<float>(1, 0) * p.x + rot.at<float>(1, 1) * p.y + rot.at<float>(1, 2) * p.z + tvec.at<float>(0, 1),
-                rot.at<float>(2, 0) * p.x + rot.at<float>(2, 1) * p.y + rot.at<float>(2, 2) * p.z + tvec.at<float>(0, 2));
-
-        ASSERT_POINT3_NEAR(res_gold, res, 1e-5);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Calib3D, TransformPoints, ALL_DEVICES);
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// ProjectPoints
-
-struct ProjectPoints : testing::TestWithParam<cv::gpu::DeviceInfo>
-{
-    cv::gpu::DeviceInfo devInfo;
-
-    virtual void SetUp()
-    {
-        devInfo = GetParam();
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(ProjectPoints, Accuracy)
-{
-    cv::Mat src = randomMat(cv::Size(1000, 1), CV_32FC3, 0, 10);
-    cv::Mat rvec = randomMat(cv::Size(3, 1), CV_32F, 0, 1);
-    cv::Mat tvec = randomMat(cv::Size(3, 1), CV_32F, 0, 1);
-    cv::Mat camera_mat = randomMat(cv::Size(3, 3), CV_32F, 0.5, 1);
-    camera_mat.at<float>(0, 1) = 0.f;
-    camera_mat.at<float>(1, 0) = 0.f;
-    camera_mat.at<float>(2, 0) = 0.f;
-    camera_mat.at<float>(2, 1) = 0.f;
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::projectPoints(loadMat(src), rvec, tvec, camera_mat, cv::Mat(), dst);
-
-    ASSERT_EQ(1, dst.rows);
-    ASSERT_EQ(MatType(CV_32FC2), MatType(dst.type()));
-
-    std::vector<cv::Point2f> dst_gold;
-    cv::projectPoints(src, rvec, tvec, camera_mat, cv::Mat(1, 8, CV_32F, cv::Scalar::all(0)), dst_gold);
-
-    ASSERT_EQ(dst_gold.size(), static_cast<size_t>(dst.cols));
-
-    cv::Mat h_dst(dst);
-
-    for (size_t i = 0; i < dst_gold.size(); ++i)
-    {
-        cv::Point2f res = h_dst.at<cv::Point2f>(0, (int)i);
-        cv::Point2f res_gold = dst_gold[i];
-
-        ASSERT_LE(cv::norm(res_gold - res) / cv::norm(res_gold), 1e-3f);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Calib3D, ProjectPoints, ALL_DEVICES);
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// SolvePnPRansac
-
-struct SolvePnPRansac : testing::TestWithParam<cv::gpu::DeviceInfo>
-{
-    cv::gpu::DeviceInfo devInfo;
-
-    virtual void SetUp()
-    {
-        devInfo = GetParam();
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(SolvePnPRansac, Accuracy)
-{
-    cv::Mat object = randomMat(cv::Size(5000, 1), CV_32FC3, 0, 100);
-    cv::Mat camera_mat = randomMat(cv::Size(3, 3), CV_32F, 0.5, 1);
-    camera_mat.at<float>(0, 1) = 0.f;
-    camera_mat.at<float>(1, 0) = 0.f;
-    camera_mat.at<float>(2, 0) = 0.f;
-    camera_mat.at<float>(2, 1) = 0.f;
-
-    std::vector<cv::Point2f> image_vec;
-    cv::Mat rvec_gold;
-    cv::Mat tvec_gold;
-    rvec_gold = randomMat(cv::Size(3, 1), CV_32F, 0, 1);
-    tvec_gold = randomMat(cv::Size(3, 1), CV_32F, 0, 1);
-    cv::projectPoints(object, rvec_gold, tvec_gold, camera_mat, cv::Mat(1, 8, CV_32F, cv::Scalar::all(0)), image_vec);
-
-    cv::Mat rvec, tvec;
-    std::vector<int> inliers;
-    cv::gpu::solvePnPRansac(object, cv::Mat(1, (int)image_vec.size(), CV_32FC2, &image_vec[0]),
-                            camera_mat, cv::Mat(1, 8, CV_32F, cv::Scalar::all(0)),
-                            rvec, tvec, false, 200, 2.f, 100, &inliers);
-
-    ASSERT_LE(cv::norm(rvec - rvec_gold), 1e-3);
-    ASSERT_LE(cv::norm(tvec - tvec_gold), 1e-3);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Calib3D, SolvePnPRansac, ALL_DEVICES);
-
-////////////////////////////////////////////////////////////////////////////////
-// reprojectImageTo3D
-
-PARAM_TEST_CASE(ReprojectImageTo3D, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int depth;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        depth = GET_PARAM(2);
-        useRoi = GET_PARAM(3);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(ReprojectImageTo3D, Accuracy)
-{
-    cv::Mat disp = randomMat(size, depth, 5.0, 30.0);
-    cv::Mat Q = randomMat(cv::Size(4, 4), CV_32FC1, 0.1, 1.0);
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::reprojectImageTo3D(loadMat(disp, useRoi), dst, Q, 3);
-
-    cv::Mat dst_gold;
-    cv::reprojectImageTo3D(disp, dst_gold, Q, false);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Calib3D, ReprojectImageTo3D, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16S)),
-    WHOLE_SUBMAT));
-
-#endif // HAVE_CUDA