renamed gpustereo -> cudastereo

2013-07-23 16:18:26 +04:00
parent d4901a99ea
commit 71d61e07b1
29 changed files with 21 additions and 21 deletions
--- a/modules/cudastereo/src/cuda/disparity_bilateral_filter.cu
+++ b/modules/cudastereo/src/cuda/disparity_bilateral_filter.cu
@@ -0,0 +1,223 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/limits.hpp"
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace disp_bilateral_filter
+    {
+        __constant__ float* ctable_color;        // colour-weight table; the kernel indexes it with the uchar colour distance
+        __constant__ float* ctable_space;        // spatial-weight table; the kernel indexes it as [|y - yi|][|x - xi|]
+        __constant__ size_t ctable_space_step;   // row pitch of ctable_space in float elements (byte step / sizeof(float))
+
+        __constant__ int cndisp;                 // uploaded by disp_load_constants; not read by the code visible here
+        __constant__ int cradius;                // filter window half-size in pixels
+
+        __constant__ short cedge_disc;           // a pixel is re-estimated when a 4-neighbour differs by at least this much
+        __constant__ short cmax_disc;            // upper bound applied to each |disparity difference| before weighting
+
+        void disp_load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc)
+        {
+            // Lookup tables: only the pointer values and the row pitch are uploaded, never the table contents.
+            const size_t space_pitch = table_space.step / sizeof(float); // byte step converted to float elements
+            cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(float*)) );
+            cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
+            cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &space_pitch, sizeof(space_pitch)) );
+            // Scalar filter parameters, copied by value into their __constant__ counterparts.
+            cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(ndisp)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(radius)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(edge_disc)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(max_disc)) );
+        }
+
+        template <int channels>
+        struct DistRgbMax
+        {   // Generic case: largest per-channel absolute difference over the first three channels of two pixels.
+            static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
+            {
+                const uchar d0 = ::abs(a[0] - b[0]);
+                const uchar d1 = ::abs(a[1] - b[1]);
+                const uchar d2 = ::abs(a[2] - b[2]);
+                return (::max(d2, ::max(d1, d0)));
+            }
+        };
+
+        template <>
+        struct DistRgbMax<1>
+        {   // Single-channel case: absolute difference of the two intensities.
+            static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
+            {
+                return ::abs(int(a[0]) - int(b[0]));
+            }
+        };
+
+        template <int channels, typename T>
+        __global__ void disp_bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
+        {   // One thread per pixel of one checkerboard parity; disp is rewritten in place. disp_step is in elements of T, img_step in bytes.
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); // every second column; parity of (y + t) selects which one
+
+            T dp[5]; // candidate disparities: centre, up, left, down, right
+
+            if (y > 0 && y < h - 1 && x > 0 && x < w - 1) // the one-pixel image border is never modified
+            {
+                dp[0] = *(disp + (y  ) * disp_step + x + 0);
+                dp[1] = *(disp + (y-1) * disp_step + x + 0);
+                dp[2] = *(disp + (y  ) * disp_step + x - 1);
+                dp[3] = *(disp + (y+1) * disp_step + x + 0);
+                dp[4] = *(disp + (y  ) * disp_step + x + 1);
+
+                if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)
+                {   // only pixels next to a disparity jump of at least cedge_disc are re-estimated
+                    const int ymin = ::max(0, y - cradius); // filter window, clipped to the image
+                    const int xmin = ::max(0, x - cradius);
+                    const int ymax = ::min(h - 1, y + cradius);
+                    const int xmax = ::min(w - 1, x + cradius);
+
+                    float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; // accumulated weighted cost of each candidate in dp[]
+
+                    const uchar* ic = img + y * img_step + channels * x; // guidance pixel at the window centre
+
+                    for(int yi = ymin; yi <= ymax; yi++)
+                    {
+                        const T* disp_y = disp + yi * disp_step;
+
+                        for(int xi = xmin; xi <= xmax; xi++)
+                        {
+                            const uchar* in = img + yi * img_step + channels * xi;
+
+                            uchar dist_rgb = DistRgbMax<channels>::calc(in, ic); // colour distance to the centre pixel
+
+                            const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)]; // colour weight times spatial weight
+
+                            const T disp_reg = disp_y[xi]; // NOTE(review): may be a value another thread of this launch is rewriting - confirm this is intended
+
+                            cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight; // truncated absolute difference, weighted
+                            cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;
+                            cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;
+                            cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;
+                            cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;
+                        }
+                    }
+
+                    float minimum = numeric_limits<float>::max(); // arg-min over the five costs; strict '<' keeps the lowest index on ties
+                    int id = 0;
+
+                    if (cost[0] < minimum)
+                    {
+                        minimum = cost[0];
+                        id = 0;
+                    }
+                    if (cost[1] < minimum)
+                    {
+                        minimum = cost[1];
+                        id = 1;
+                    }
+                    if (cost[2] < minimum)
+                    {
+                        minimum = cost[2];
+                        id = 2;
+                    }
+                    if (cost[3] < minimum)
+                    {
+                        minimum = cost[3];
+                        id = 3;
+                    }
+                    if (cost[4] < minimum)
+                    {
+                        minimum = cost[4];
+                        id = 4;
+                    }
+
+                    *(disp + y * disp_step + x) = dp[id]; // store the cheapest candidate back into the disparity map
+                }
+            }
+        }
+
+        template <typename T>
+        void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream)
+        {
+            // Host-side launcher for the disparity bilateral filter kernel.
+            //   disp     - disparity map, refined in place
+            //   img      - guidance image with `channels` interleaved 8-bit channels
+            //   channels - 1 or 3; anything else is rejected through CV_Error
+            //   iters    - number of refinement passes
+            //   stream   - stream the kernels are enqueued on
+            if (channels != 1 && channels != 3)
+                CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
+
+            // A thread handles one pixel out of each horizontal pair, so a 32x8 block spans 64x8 pixels.
+            const dim3 threads(32, 8, 1);
+            const dim3 grid(divUp(disp.cols, threads.x << 1), divUp(disp.rows, threads.y), 1);
+            const size_t disp_pitch = disp.step / sizeof(T); // row pitch in elements of T
+
+            for (int pass = 0; pass < iters; ++pass)
+            {
+                // Two launches per pass: checkerboard parity 0, then parity 1.
+                for (int parity = 0; parity < 2; ++parity)
+                {
+                    if (channels == 1)
+                        disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(parity, disp.data, disp_pitch, img.data, img.step, disp.rows, disp.cols);
+                    else
+                        disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(parity, disp.data, disp_pitch, img.data, img.step, disp.rows, disp.cols);
+
+                    cudaSafeCall( cudaGetLastError() );
+                }
+            }
+
+            // The host blocks only when the work was queued on the default stream.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Explicit instantiations for the two disparity element types compiled into this translation unit.
+        template void disp_bilateral_filter<uchar>(PtrStepSz<uchar> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
+        template void disp_bilateral_filter<short>(PtrStepSz<short> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
+    } // namespace disp_bilateral_filter
+}}} // namespace cv { namespace cuda { namespace device
+
+#endif /* CUDA_DISABLER */
--- a/modules/cudastereo/src/cuda/stereobm.cu
+++ b/modules/cudastereo/src/cuda/stereobm.cu
@@ -0,0 +1,540 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace stereobm
+    {
+        //////////////////////////////////////////////////////////////////////////////////////////////////
+        /////////////////////////////////////// Stereo BM ////////////////////////////////////////////////
+        //////////////////////////////////////////////////////////////////////////////////////////////////
+
+        #define ROWSperTHREAD 21     // number of image rows each stereoKernel thread walks down (also the grid's row granularity)
+
+        #define BLOCK_W 128          // thread block width used by stereoKernel (original note: "(464)" - meaning not visible here)
+        #define N_DISPARITIES 8      // disparities evaluated together per pass of the d loop
+
+        #define STEREO_MIND 0                    // first disparity checked by the d loop
+        #define STEREO_DISP_STEP N_DISPARITIES   // d-loop increment; original note said "must be <= 1 to avoid aliasing" - TODO confirm, the value is 8
+
+        __constant__ unsigned int* cminSSDImage; // per-pixel best SSD found so far (buffer pointer uploaded by stereoBM_GPU)
+        __constant__ size_t cminSSD_step;        // row pitch of cminSSDImage in elements
+        __constant__ int cwidth;                 // image width in pixels (left.cols)
+        __constant__ int cheight;                // image height in pixels (left.rows)
+
+        __device__ __forceinline__ int SQ(int a)
+        {   // Square of a signed integer; used for the squared pixel differences in StepDown and InitColSSD.
+            return a * a;
+        }
+
+        template<int RADIUS>
+        __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
+        {   // Returns col_ssd[0] + ... + col_ssd[2 * RADIUS]; half of that sum is taken from a neighbouring thread through col_ssd_cache.
+            unsigned int cache = 0;
+            unsigned int cache2 = 0;
+
+            for(int i = 1; i <= RADIUS; i++) // cache = col_ssd[1] + ... + col_ssd[RADIUS]
+                cache += col_ssd[i];
+
+            col_ssd_cache[0] = cache; // publish this thread's partial sum
+
+            __syncthreads(); // block-wide barrier: the function is meant to be called by every thread of the block
+
+            if (threadIdx.x < BLOCK_W - RADIUS)
+                cache2 = col_ssd_cache[RADIUS]; // assumes the caller passes col_ssd_cache + threadIdx.x (as stereoKernel does): partial sum of thread threadIdx.x + RADIUS
+            else
+                for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++) // last RADIUS threads: no such neighbour inside the block, so sum directly
+                    cache2 += col_ssd[i];
+
+            return col_ssd[0] + cache + cache2;
+        }
+
+        template<int RADIUS>
+        __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
+        {   // Returns (smallest window SSD, index 0..N_DISPARITIES-1 of the disparity slot that produced it).
+            unsigned int ssd[N_DISPARITIES];
+
+            // Slots of successive disparities are BLOCK_W + 2 * RADIUS words apart (the COL_SSD_SIZE macro named in older comments is not defined in the visible code).
+            ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
+            __syncthreads(); // col_ssd_cache is reused by the next CalcSSD call, so wait until all threads have read it
+            ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
+            __syncthreads();
+            ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
+            __syncthreads();
+            ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
+            __syncthreads();
+            ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
+            __syncthreads();
+            ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
+            __syncthreads();
+            ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
+            __syncthreads();
+            ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
+
+            int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7]))); // NOTE(review): unsigned minimum stored in a signed int
+
+            int bestIdx = 0;
+            for (int i = 0; i < N_DISPARITIES; i++) // no early exit: on ties the highest matching index wins
+            {
+                if (mssd == ssd[i])
+                    bestIdx = i;
+            }
+
+            return make_uint2(mssd, bestIdx);
+        }
+
+        template<int RADIUS>
+        __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
+        {   // Updates eight column sums (disparities d .. d + 7): removes the squared difference at idx1 and adds the one at idx2.
+            unsigned char leftPixel1;
+            unsigned char leftPixel2;
+            unsigned char rightPixel1[8];
+            unsigned char rightPixel2[8];
+            unsigned int diff1, diff2;
+
+            leftPixel1 = imageL[idx1]; // pixel whose contribution is subtracted
+            leftPixel2 = imageL[idx2]; // pixel whose contribution is added
+
+            idx1 = idx1 - d; // right-image positions for disparity d
+            idx2 = idx2 - d;
+
+            rightPixel1[7] = imageR[idx1 - 7];
+            rightPixel1[0] = imageR[idx1 - 0];
+            rightPixel1[1] = imageR[idx1 - 1];
+            rightPixel1[2] = imageR[idx1 - 2];
+            rightPixel1[3] = imageR[idx1 - 3];
+            rightPixel1[4] = imageR[idx1 - 4];
+            rightPixel1[5] = imageR[idx1 - 5];
+            rightPixel1[6] = imageR[idx1 - 6];
+
+            rightPixel2[7] = imageR[idx2 - 7];
+            rightPixel2[0] = imageR[idx2 - 0];
+            rightPixel2[1] = imageR[idx2 - 1];
+            rightPixel2[2] = imageR[idx2 - 2];
+            rightPixel2[3] = imageR[idx2 - 3];
+            rightPixel2[4] = imageR[idx2 - 4];
+            rightPixel2[5] = imageR[idx2 - 5];
+            rightPixel2[6] = imageR[idx2 - 6];
+
+            // Slot k (disparity d + k) lives k * (BLOCK_W + 2 * RADIUS) words after col_ssd[0].
+            diff1 = leftPixel1 - rightPixel1[0]; // may be negative; held in an unsigned int and passed to SQ(int)
+            diff2 = leftPixel2 - rightPixel2[0];
+            col_ssd[0 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
+
+            diff1 = leftPixel1 - rightPixel1[1];
+            diff2 = leftPixel2 - rightPixel2[1];
+            col_ssd[1 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
+
+            diff1 = leftPixel1 - rightPixel1[2];
+            diff2 = leftPixel2 - rightPixel2[2];
+            col_ssd[2 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
+
+            diff1 = leftPixel1 - rightPixel1[3];
+            diff2 = leftPixel2 - rightPixel2[3];
+            col_ssd[3 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
+
+            diff1 = leftPixel1 - rightPixel1[4];
+            diff2 = leftPixel2 - rightPixel2[4];
+            col_ssd[4 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
+
+            diff1 = leftPixel1 - rightPixel1[5];
+            diff2 = leftPixel2 - rightPixel2[5];
+            col_ssd[5 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
+
+            diff1 = leftPixel1 - rightPixel1[6];
+            diff2 = leftPixel2 - rightPixel2[6];
+            col_ssd[6 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
+
+            diff1 = leftPixel1 - rightPixel1[7];
+            diff2 = leftPixel2 - rightPixel2[7];
+            col_ssd[7 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
+        }
+
+        template<int RADIUS>
+        __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
+        {   // Builds the initial column sums for column x_tex: squared differences over 2 * RADIUS + 1 rows starting at y_tex, for disparities d .. d + 7.
+            unsigned char leftPixel1;
+            int idx;
+            unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0}; // one accumulator per disparity
+
+            for(int i = 0; i < (2 * RADIUS + 1); i++) // one iteration per row of the window
+            {
+                idx = y_tex * im_pitch + x_tex;
+                leftPixel1 = imageL[idx];
+                idx = idx - d; // matching right-image position for disparity d
+
+                diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);
+                diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
+                diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
+                diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
+                diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);
+                diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);
+                diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);
+                diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
+
+                y_tex += 1;
+            }
+            // Store each sum in its disparity slot; slots are BLOCK_W + 2 * RADIUS words apart.
+            col_ssd[0 * (BLOCK_W + 2 * RADIUS)] = diffa[0];
+            col_ssd[1 * (BLOCK_W + 2 * RADIUS)] = diffa[1];
+            col_ssd[2 * (BLOCK_W + 2 * RADIUS)] = diffa[2];
+            col_ssd[3 * (BLOCK_W + 2 * RADIUS)] = diffa[3];
+            col_ssd[4 * (BLOCK_W + 2 * RADIUS)] = diffa[4];
+            col_ssd[5 * (BLOCK_W + 2 * RADIUS)] = diffa[5];
+            col_ssd[6 * (BLOCK_W + 2 * RADIUS)] = diffa[6];
+            col_ssd[7 * (BLOCK_W + 2 * RADIUS)] = diffa[7];
+        }
+
+        template<int RADIUS>
+        __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)
+        {   // Block matching: each thread owns one image column and walks down up to ROWSperTHREAD rows, keeping the lowest-SSD disparity per pixel.
+            extern __shared__ unsigned int col_ssd_cache[]; // dynamic shared memory: BLOCK_W cache words first, column sums after them
+            volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x; // this thread's column-sum slot for disparity offset 0
+            volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0;  // first 2 * RADIUS threads also maintain a column BLOCK_W further right; null otherwise
+
+            //#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
+            int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS); // image column of the pixel this thread writes
+            //#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)
+            #define Y (blockIdx.y * ROWSperTHREAD + RADIUS)
+            //int Y = blockIdx.y * ROWSperTHREAD + RADIUS;  (Y is a macro defined inside the function; no #undef is visible)
+
+            unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; // best SSD so far for this thread's first pixel
+            unsigned char* disparImage = disp.data + X + Y * disp.step;      // output disparity for this thread's first pixel
+         /*   if (X < cwidth)
+            {
+                unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
+                for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
+                    *ptr = 0xFFFFFFFF;
+            }*/
+            int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS); // rows this block processes, clipped at the bottom border
+            int y_tex;
+            int x_tex = X - RADIUS; // column whose vertical sums this thread maintains
+
+            if (x_tex >= cwidth)
+                return; // NOTE(review): threads leaving here skip every __syncthreads() below - confirm this is safe on the targeted architectures
+
+            for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP) // each pass evaluates disparities d .. d + N_DISPARITIES - 1
+            {
+                y_tex = Y - RADIUS; // top row of the first window
+
+                InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);
+
+                if (col_ssd_extra > 0) // NOTE(review): pointer compared with '> 0'; the row loop below tests the same pointer with a plain null check
+                    if (x_tex + BLOCK_W < cwidth)
+                        InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
+
+                __syncthreads(); //before MinSSD function
+
+                if (X < cwidth - RADIUS && Y < cheight - RADIUS) // NOTE(review): MinSSD contains __syncthreads() and is reached under a per-thread condition
+                {
+                    uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
+                    if (minSSD.x < minSSDImage[0]) // keep the result only if it beats the best SSD recorded so far
+                    {
+                        disparImage[0] = (unsigned char)(d + minSSD.y);
+                        minSSDImage[0] = minSSD.x;
+                    }
+                }
+
+                for(int row = 1; row < end_row; row++)
+                {
+                    int idx1 = y_tex * img_step + x_tex;                      // row leaving the window
+                    int idx2 = (y_tex + (2 * RADIUS + 1)) * img_step + x_tex; // row entering the window
+
+                    __syncthreads();
+
+                    StepDown<RADIUS>(idx1, idx2, left, right, d, col_ssd);
+
+                    if (col_ssd_extra)
+                        if (x_tex + BLOCK_W < cwidth)
+                            StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra); // base pointers shifted by BLOCK_W so idx1/idx2 address the extra column
+
+                    y_tex += 1;
+
+                    __syncthreads(); //before MinSSD function
+
+                    if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
+                    {
+                        int idx = row * cminSSD_step;
+                        uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
+                        if (minSSD.x < minSSDImage[idx])
+                        {
+                            disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
+                            minSSDImage[idx] = minSSD.x;
+                        }
+                    }
+                } // for row loop
+            } // for d loop
+        }
+
+
+        template<int RADIUS> void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream)
+        {
+            // Launch geometry: BLOCK_W threads per block along x; a block covers BLOCK_W columns and
+            // ROWSperTHREAD rows of the area left after removing the disparity and window margins.
+            const int cols_to_cover = left.cols - maxdisp - 2 * RADIUS;
+            const int rows_to_cover = left.rows - 2 * RADIUS;
+            const dim3 threads(BLOCK_W, 1, 1);
+            const dim3 grid(divUp(cols_to_cover, BLOCK_W), divUp(rows_to_cover, ROWSperTHREAD), 1);
+
+            // Shared memory: BLOCK_W cache words plus N_DISPARITIES slots of (BLOCK_W + 2 * RADIUS) column sums.
+            const size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
+            stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0) // the host waits only when the default stream is used
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream);
+        // Dispatch table indexed by window radius (winsz >> 1 in stereoBM_GPU): entry i must be kernel_caller<i>.
+        const static kernel_caller_t callers[] =
+        {
+            0, // radius 0 has no kernel; stereoBM_GPU rejects it before indexing
+            kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
+            kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
+            kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<14>, kernel_caller<15>,
+            kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
+            kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
+            // Every radius needs its own instantiation: RADIUS fixes the window size and the
+            // shared-memory layout inside stereoKernel, so slot i must never alias another radius.
+        };
+        const int calles_num = sizeof(callers)/sizeof(callers[0]); // number of table entries (largest supported radius + 1)
+
+        void stereoBM_GPU(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
+        {
+            // The window radius selects the kernel_caller<RADIUS> instantiation from the callers table.
+            const int radius = winsz >> 1;
+            if (radius == 0 || radius >= calles_num)
+                CV_Error(cv::Error::StsBadArg, "Unsupported window size");
+
+            // Reset the outputs: disparities to 0, best-SSD buffer to 0xFF bytes so that
+            // any computed SSD compares smaller than the stored value.
+            cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
+            cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
+
+            // Publish the image size and the SSD buffer (pointer and pitch in elements) to constant memory.
+            cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
+            cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
+            cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
+            const size_t ssd_pitch_elems = minSSD_buf.step/minSSD_buf.elemSize();
+            cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step,  &ssd_pitch_elems, sizeof(ssd_pitch_elems) ) );
+
+            const kernel_caller_t launch = callers[radius];
+            launch(left, right, disp, maxdisp, stream);
+        }
+
+        //////////////////////////////////////////////////////////////////////////////////////////////////
+        /////////////////////////////////////// Sobel Prefilter //////////////////////////////////////////
+        //////////////////////////////////////////////////////////////////////////////////////////////////
+
+        texture<unsigned char, 2, cudaReadModeElementType> texForSobel; // 2D texture reference: bound to the input image in prefilter_xsobel, sampled by prefilter_kernel
+
+        __global__ void prefilter_kernel(PtrStepSzb output, int prefilterCap)
+        {   // One thread per output pixel: horizontal 3x3 Sobel response sampled from texForSobel, clamped, shifted and saturated at 255.
+            const int col = blockDim.x * blockIdx.x + threadIdx.x;
+            const int row = blockDim.y * blockIdx.y + threadIdx.y;
+
+            if (col >= output.cols || row >= output.rows)
+                return; // thread lies outside the image
+
+            const int gx = (int)tex2D(texForSobel, col - 1, row - 1) * (-1) + (int)tex2D(texForSobel, col + 1, row - 1) * (1) +
+                           (int)tex2D(texForSobel, col - 1, row    ) * (-2) + (int)tex2D(texForSobel, col + 1, row    ) * (2) +
+                           (int)tex2D(texForSobel, col - 1, row + 1) * (-1) + (int)tex2D(texForSobel, col + 1, row + 1) * (1);
+
+            // Clamp to [-prefilterCap, prefilterCap], shift up by prefilterCap, then cap at 255.
+            const int limited = ::min(::max(-prefilterCap, gx), prefilterCap);
+            output.ptr(row)[col] = ::min(limited + prefilterCap, 255) & 0xFF;
+        }
+
+        void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, cudaStream_t & stream)
+        {
+            // Expose the input image to prefilter_kernel through the texForSobel texture reference.
+            const cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<unsigned char>();
+            cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, channel_desc, input.cols, input.rows, input.step ) );
+
+            // One thread per pixel, 16x16 threads per block, grid rounded up to cover the whole image.
+            const dim3 threads(16, 16, 1);
+            const dim3 grid(divUp(input.cols, threads.x), divUp(input.rows, threads.y), 1);
+
+            prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
+            cudaSafeCall( cudaGetLastError() );
+
+            // The host waits for completion only on the default stream.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+
+            cudaSafeCall( cudaUnbindTexture (texForSobel ) );
+        }
+
+
+        //////////////////////////////////////////////////////////////////////////////////////////////////
+        /////////////////////////////////// Textureness filtering ////////////////////////////////////////
+        //////////////////////////////////////////////////////////////////////////////////////////////////
+
+        texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF; // 2D texture reference read as normalised floats: bound in postfilter_textureness, sampled by sobel()
+
+        __device__ __forceinline__ float sobel(int x, int y)
+        {   // Absolute value of the horizontal 3x3 Sobel response at (x, y), sampled from texForTF.
+            float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
+                         tex2D(texForTF, x - 1, y    ) * (-2) + tex2D(texForTF, x + 1, y    ) * (2) +
+                         tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
+            return fabs(conv);
+        }
+
+        __device__ float CalcSums(float *cols, float *cols_cache, int winsz)
+        {   // Returns cols[0] + ... + cols[winsz - 1] for odd winsz; half of the sum is taken from a neighbouring thread through cols_cache.
+            float cache = 0;
+            float cache2 = 0;
+            int winsz2 = winsz/2;
+
+            for(int i = 1; i <= winsz2; i++) // cache = cols[1] + ... + cols[winsz2]
+                cache += cols[i];
+
+            cols_cache[0] = cache; // publish this thread's partial sum
+
+            __syncthreads(); // block-wide barrier: the function is meant to be called by every thread of the block
+
+            if (threadIdx.x < blockDim.x - winsz2)
+                cache2 = cols_cache[winsz2]; // assumes the caller passes cols_cache + threadIdx.x (as textureness_kernel does): partial sum of thread threadIdx.x + winsz2
+            else
+                for(int i = winsz2 + 1; i < winsz; i++) // last winsz2 threads: no such neighbour inside the block, so sum directly
+                    cache2 += cols[i];
+
+            return cols[0] + cache + cache2;
+        }
+
+        #define RpT (2 * ROWSperTHREAD)  // rows covered by one block of textureness_kernel; value found experimentally
+
+        __global__ void textureness_kernel(PtrStepSzb disp, int winsz, float threshold)
+        {   // Zeroes disparities whose winsz x winsz window sum of |Sobel| responses (scaled by 255) is below threshold.
+            int winsz2 = winsz/2;
+            int n_dirty_pixels = (winsz2) * 2; // extra columns needed to the right of the block's last thread
+
+            extern __shared__ float cols_cache[]; // dynamic shared memory: blockDim.x cache floats first, column sums after them
+            float *cols = cols_cache + blockDim.x + threadIdx.x; // this thread's column-sum slot
+            float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0; // first n_dirty_pixels threads also maintain a column blockDim.x further right; null otherwise
+
+            int x = blockIdx.x * blockDim.x + threadIdx.x;
+            int beg_row = blockIdx.y * RpT;
+            int end_row = ::min(beg_row + RpT, disp.rows);
+
+            if (x < disp.cols) // NOTE(review): the __syncthreads() calls below sit inside this per-thread condition - confirm behaviour for partially filled blocks
+            {
+                int y = beg_row;
+
+                float sum = 0;
+                float sum_extra = 0;
+
+                for(int i = y - winsz2; i <= y + winsz2; ++i) // vertical sum over the window rows for the first output row
+                {
+                    sum += sobel(x - winsz2, i);
+                    if (cols_extra)
+                        sum_extra += sobel(x + blockDim.x - winsz2, i);
+                }
+                *cols = sum;
+                if (cols_extra)
+                    *cols_extra = sum_extra;
+
+                __syncthreads(); // column sums must be visible before CalcSums reads neighbouring slots
+
+                float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
+                if (sum_win < threshold)
+                    disp.data[y * disp.step + x] = 0; // too little texture: clear the disparity
+
+                __syncthreads(); // finish reading the column sums before they are overwritten for the next row
+
+                for(int y = beg_row + 1; y < end_row; ++y)
+                {
+                    sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2); // slide down: drop the row that left, add the row that entered
+                    *cols = sum;
+
+                    if (cols_extra)
+                    {
+                        sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
+                        *cols_extra = sum_extra;
+                    }
+
+                    __syncthreads();
+                    float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
+                    if (sum_win < threshold)
+                        disp.data[y * disp.step + x] = 0;
+
+                    __syncthreads();
+                }
+            }
+        }
+
+        void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream)
+        {
+            // Scale the threshold by the window area (winsz * winsz) before handing it to the kernel.
+            const float window_threshold = avgTexturenessThreshold * (winsz * winsz);
+            // Sampling setup for texForTF: linear filtering, wrap addressing on both axes.
+            texForTF.filterMode     = cudaFilterModeLinear;
+            texForTF.addressMode[0] = cudaAddressModeWrap;
+            texForTF.addressMode[1] = cudaAddressModeWrap;
+            const cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<unsigned char>();
+            cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, channel_desc, input.cols, input.rows, input.step ) );
+
+            // Launch geometry: 128 threads per block along x, each block covering RpT rows.
+            const dim3 threads(128, 1, 1);
+            const dim3 grid(divUp(input.cols, threads.x), divUp(input.rows, RpT), 1);
+
+            // Shared memory: two floats per thread plus 2 * (winsz / 2) extra column slots.
+            const int n_dirty_pixels = (winsz/2) * 2;
+            const size_t smem_size = (threads.x + threads.x + n_dirty_pixels) * sizeof(float);
+            textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, window_threshold);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0) // the host waits only when the default stream is used
+                cudaSafeCall( cudaDeviceSynchronize() );
+
+            cudaSafeCall( cudaUnbindTexture (texForTF) );
+        }
+    } // namespace stereobm
+}}} // namespace cv { namespace cuda { namespace device
+
+
+#endif /* CUDA_DISABLER */
--- a/modules/cudastereo/src/cuda/stereobp.cu
+++ b/modules/cudastereo/src/cuda/stereobp.cu
@@ -0,0 +1,538 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/core/cuda/limits.hpp"
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace stereobp
+    {
+        ///////////////////////////////////////////////////////////////
+        /////////////////////// load constants ////////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        // Belief-propagation parameters held in constant memory. They are written by
+        // load_constants() and read by the device code in this namespace.
+        __constant__ int   cndisp;            // number of disparity planes the kernels loop over
+        __constant__ float cmax_data_term;    // cap on the data cost (applied together with cdata_weight)
+        __constant__ float cdata_weight;      // multiplier applied to the data cost
+        __constant__ float cmax_disc_term;    // offset added to the minimum message value to form the truncation cap
+        __constant__ float cdisc_single_jump; // penalty added per one-disparity step
+
+        // Copies the host-side parameters into the constant-memory symbols above.
+        // Each copy is checked with cudaSafeCall.
+        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
+        {
+            cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(ndisp)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(max_data_term)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(data_weight)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(max_disc_term)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(disc_single_jump)) );
+        }
+
+        ///////////////////////////////////////////////////////////////
+        ////////////////////////// comp data //////////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        // PixDiff<cn> stores one left-image pixel of cn channels; operator() returns its
+        // difference to the right-image pixel it is given, as a float.
+        template <int cn> struct PixDiff;
+
+        // Single-channel pixels: plain absolute difference of the two byte values.
+        template <> struct PixDiff<1>
+        {
+            __device__ __forceinline__ PixDiff(const uchar* ls) : l(*ls)
+            {
+            }
+            __device__ __forceinline__ float operator()(const uchar* rs) const
+            {
+                const int diff = (int)l - *rs;
+                return ::abs(diff);
+            }
+            uchar l; // cached left pixel
+        };
+        // Three-channel pixels: weighted sum of per-channel absolute differences.
+        // Channels 0 / 1 / 2 are weighted 0.114 / 0.587 / 0.299 (blue / green / red weights,
+        // so the data is presumably stored in BGR order).
+        template <> struct PixDiff<3>
+        {
+            __device__ __forceinline__ PixDiff(const uchar* ls) : l(*((uchar3*)ls))
+            {
+            }
+            __device__ __forceinline__ float operator()(const uchar* rs) const
+            {
+                const float weightB = 0.114f;
+                const float weightG = 0.587f;
+                const float weightR = 0.299f;
+
+                const int diff0 = ::abs((int)l.x - rs[0]);
+                const int diff1 = ::abs((int)l.y - rs[1]);
+                const int diff2 = ::abs((int)l.z - rs[2]);
+
+                float val = weightB * diff0;
+                val += weightG * diff1;
+                val += weightR * diff2;
+
+                return val;
+            }
+            uchar3 l; // cached left pixel
+        };
+        // Four-channel pixels: same weighted difference as the three-channel case, computed
+        // on the first three components; the fourth component (w) is not used.
+        template <> struct PixDiff<4>
+        {
+            __device__ __forceinline__ PixDiff(const uchar* ls) : l(*((uchar4*)ls))
+            {
+            }
+            __device__ __forceinline__ float operator()(const uchar* rs) const
+            {
+                const float weightB = 0.114f;
+                const float weightG = 0.587f;
+                const float weightR = 0.299f;
+
+                // Load the whole right pixel with a single 4-byte read.
+                const uchar4 r = *((uchar4*)rs);
+
+                const int diff0 = ::abs((int)l.x - r.x);
+                const int diff1 = ::abs((int)l.y - r.y);
+                const int diff2 = ::abs((int)l.z - r.z);
+
+                float val = weightB * diff0;
+                val += weightG * diff1;
+                val += weightR * diff2;
+
+                return val;
+            }
+            uchar4 l; // cached left pixel
+        };
+
+        // Fills the data-cost volume, one thread per pixel; the outermost rows and columns
+        // of the image are left untouched.
+        //
+        // For pixel (x, y) and disparity disp the stored cost is
+        //   min(cdata_weight * PixDiff(left(x, y), right(x - disp, y)), cdata_weight * cmax_data_term)
+        // and disparities with x - disp < 1 get the capped value directly. Consecutive
+        // disparity planes in `data` are data.step * left.rows bytes apart.
+        template <int cn, typename D>
+        __global__ void comp_data(const PtrStepSzb left, const PtrStepb right, PtrStep<D> data)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            const bool interior = (y > 0) && (y < left.rows - 1) && (x > 0) && (x < left.cols - 1);
+            if (!interior)
+                return;
+
+            const PixDiff<cn> pixDiff(left.ptr(y) + x * cn);
+            const uchar* rightPix = right.ptr(y) + x * cn;
+
+            D* cost = data.ptr(y) + x;
+            const size_t planeStride = data.step * left.rows / sizeof(D);
+
+            // Upper bound of the weighted cost; also used where the match would leave the image.
+            const float maxCost = cdata_weight * cmax_data_term;
+
+            for (int disp = 0; disp < cndisp; disp++)
+            {
+                float value = maxCost;
+                if (x - disp >= 1)
+                    value = fmin(cdata_weight * pixDiff(rightPix - disp * cn), maxCost);
+
+                cost[disp * planeStride] = saturate_cast<D>(value);
+            }
+        }
+
+        // Shared launcher behind every comp_data_gpu specialization.
+        //
+        // Launches comp_data<cn, D> on `stream` with 32x8 thread blocks and enough blocks to
+        // cover left.cols x left.rows, reinterpreting `data` as a PtrStepSz<D>. The launch is
+        // checked with cudaGetLastError(); when `stream` is the default stream (0) the call
+        // also waits for the device to finish.
+        template <int cn, typename D>
+        static void comp_data_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
+        {
+            const dim3 threads(32, 8, 1);
+            const dim3 grid(divUp(left.cols, threads.x), divUp(left.rows, threads.y), 1);
+
+            comp_data<cn, D><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<D>)data);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Host entry point for the data-cost computation.
+        // T is the pixel type of the input images (uchar, uchar3 or uchar4) and D the element
+        // type of the cost volume (short or float). Only the specializations below exist.
+        template<typename T, typename D>
+        void comp_data_gpu(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
+
+        template <> void comp_data_gpu<uchar, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
+        {
+            comp_data_caller<1, short>(left, right, data, stream);
+        }
+        template <> void comp_data_gpu<uchar, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
+        {
+            comp_data_caller<1, float>(left, right, data, stream);
+        }
+
+        template <> void comp_data_gpu<uchar3, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
+        {
+            comp_data_caller<3, short>(left, right, data, stream);
+        }
+        template <> void comp_data_gpu<uchar3, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
+        {
+            comp_data_caller<3, float>(left, right, data, stream);
+        }
+
+        template <> void comp_data_gpu<uchar4, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
+        {
+            comp_data_caller<4, short>(left, right, data, stream);
+        }
+        template <> void comp_data_gpu<uchar4, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
+        {
+            comp_data_caller<4, float>(left, right, data, stream);
+        }
+
+        ///////////////////////////////////////////////////////////////
+        //////////////////////// data step down ///////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        // Builds one level of the data-cost pyramid, one thread per destination pixel.
+        // For each of the cndisp planes, dst(x, y) receives the sum (not the average) of the
+        // 2x2 source block whose top-left corner is (2x, 2y). Plane d occupies rows
+        // [d * src_rows, ...) of `src` and rows [d * dst_rows, ...) of `dst`.
+        template <typename T>
+        __global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (x >= dst_cols || y >= dst_rows)
+                return;
+
+            const int srcX = 2 * x;
+
+            for (int d = 0; d < cndisp; ++d)
+            {
+                const T* upperRow = src.ptr(d * src_rows + (2 * y + 0));
+                const T* lowerRow = src.ptr(d * src_rows + (2 * y + 1));
+
+                // Accumulate in float; the summation order is kept fixed on purpose.
+                float acc = upperRow[srcX + 0];
+                acc += lowerRow[srcX + 0];
+                acc += upperRow[srcX + 1];
+                acc += lowerRow[srcX + 1];
+
+                dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(acc);
+            }
+        }
+
+        // Host launcher for data_step_down<T>: 32x8 thread blocks covering dst_cols x dst_rows,
+        // issued on `stream`. `src` and `dst` are reinterpreted as PtrStepSz<T>. Waits for the
+        // device when `stream` is the default stream (0).
+        template<typename T>
+        void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream)
+        {
+            const dim3 threads(32, 8, 1);
+            const dim3 grid(divUp(dst_cols, threads.x), divUp(dst_rows, threads.y), 1);
+
+            data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)src, (PtrStepSz<T>)dst);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Explicit instantiations for the two supported element types.
+        template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
+        template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
+
+        ///////////////////////////////////////////////////////////////
+        /////////////////// level up messages  ////////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        // Copies messages between two pyramid levels, one thread per destination pixel:
+        // dst(x, y) takes the value of src(x / 2, y / 2) in each of the cndisp planes.
+        // Planes are dst_rows rows apart in `dst` and src_rows rows apart in `src`.
+        template <typename T>
+        __global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (x >= dst_cols || y >= dst_rows)
+                return;
+
+            // Distance, in elements, between the same pixel in consecutive planes.
+            const size_t dstPlane = dst.step * dst_rows / sizeof(T);
+            const size_t srcPlane = src.step * src_rows / sizeof(T);
+
+            T*       out = dst.ptr(y) + x;
+            const T* in  = src.ptr(y / 2) + x / 2;
+
+            for (int d = 0; d < cndisp; ++d)
+                out[d * dstPlane] = in[d * srcPlane];
+        }
+
+        // Runs level_up_message<T> for the four message sets (mus, mds, mls, mrs), in that order.
+        // Each argument points to a pair of buffers; the one at index dst_idx is written and
+        // the one at index (dst_idx + 1) & 1 is read. Launches use 32x8 blocks covering
+        // dst_cols x dst_rows on `stream`; the host waits once at the end when `stream` is
+        // the default stream (0).
+        template <typename T>
+        void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream)
+        {
+            const dim3 threads(32, 8, 1);
+            const dim3 grid(divUp(dst_cols, threads.x), divUp(dst_rows, threads.y), 1);
+
+            const int src_idx = (dst_idx + 1) & 1;
+
+            PtrStepSzb* const messageSets[4] = { mus, mds, mls, mrs };
+
+            for (int k = 0; k < 4; ++k)
+            {
+                PtrStepSzb* const buffers = messageSets[k];
+
+                level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)buffers[src_idx], (PtrStepSz<T>)buffers[dst_idx]);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Explicit instantiations for the two supported element types.
+        template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
+        template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
+
+        ///////////////////////////////////////////////////////////////
+        ////////////////////  calc all iterations /////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        // In-place two-pass sweep over the cndisp values dst[0], dst[step], dst[2 * step], ...
+        //
+        // The forward pass walks from disparity 1 upwards, the backward pass from cndisp - 2
+        // downwards. In both, a running value is increased by cdisc_single_jump at every step;
+        // where it is smaller than the stored entry the entry is overwritten with it,
+        // otherwise the running value restarts from the stored entry.
+        template <typename T>
+        __device__ void calc_min_linear_penalty(T* dst, size_t step)
+        {
+            // Forward pass.
+            float running = dst[0];
+            for (int disp = 1; disp < cndisp; ++disp)
+            {
+                running += cdisc_single_jump;
+
+                T* entry = dst + step * disp;
+                const float stored = *entry;
+
+                if (running < stored)
+                    *entry = saturate_cast<T>(running);
+                else
+                    running = stored;
+            }
+
+            // Backward pass, seeded with the last entry.
+            running = dst[(cndisp - 1) * step];
+            for (int disp = cndisp - 2; disp >= 0; disp--)
+            {
+                running += cdisc_single_jump;
+
+                T* entry = dst + step * disp;
+                const float stored = *entry;
+
+                if (running < stored)
+                    *entry = saturate_cast<T>(running);
+                else
+                    running = stored;
+            }
+        }
+
+        // Computes one outgoing message and stores it in `dst`.
+        //
+        // Steps, each over the cndisp disparity planes:
+        //   1. dst = msg1 + msg2 + msg3 + data (summed in float), remembering the smallest sum;
+        //   2. calc_min_linear_penalty() is applied to dst in place;
+        //   3. every entry is clamped to (smallest sum + cmax_disc_term);
+        //   4. the mean of the clamped entries is subtracted from every entry.
+        // msg_disp_step / data_disp_step are the element distances between consecutive planes
+        // of the message buffers and of `data` respectively.
+        template <typename T>
+        __device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
+        {
+            float lowest = device::numeric_limits<float>::max();
+
+            for (int i = 0; i < cndisp; ++i)
+            {
+                const size_t msgOff = msg_disp_step * i;
+
+                float total = msg1[msgOff];
+                total += msg2[msgOff];
+                total += msg3[msgOff];
+                total += data[data_disp_step * i];
+
+                lowest = (total < lowest) ? total : lowest;
+
+                dst[msgOff] = saturate_cast<T>(total);
+            }
+
+            calc_min_linear_penalty(dst, msg_disp_step);
+
+            // Truncation level for the message values.
+            const float cap = lowest + cmax_disc_term;
+
+            float mean = 0;
+            for (int i = 0; i < cndisp; ++i)
+            {
+                T* entry = dst + msg_disp_step * i;
+
+                float value = *entry;
+                if (value > cap)
+                {
+                    value = cap;
+                    *entry = saturate_cast<T>(cap);
+                }
+                mean += value;
+            }
+            mean /= cndisp;
+
+            // Normalise the message by removing its mean.
+            for (int i = 0; i < cndisp; ++i)
+                dst[msg_disp_step * i] -= mean;
+        }
+
+        // One message-update pass over half of the pixels.
+        //
+        // Each thread handles the pixel in row y whose column has parity (y + t) & 1, so
+        // successive values of t alternate between the two interleaved pixel sets. Border
+        // pixels are skipped. For the selected pixel the four messages u, d, l, r are
+        // recomputed in place with message(), each from three neighbouring messages
+        // (u from the row below, d from the row above, l from the right column,
+        // r from the left column) plus the pixel's data cost.
+        // elem_step is the row pitch of the message buffers in elements.
+        template <typename T>
+        __global__ void one_iteration(int t, int elem_step, T* u, T* d, T* l, T* r, const PtrStep<T> data, int cols, int rows)
+        {
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
+
+            if (y <= 0 || y >= rows - 1 || x <= 0 || x >= cols - 1)
+                return;
+
+            const int rowOffset = y * elem_step;
+
+            T* up    = u + rowOffset + x;
+            T* down  = d + rowOffset + x;
+            T* left  = l + rowOffset + x;
+            T* right = r + rowOffset + x;
+            const T* cost = data.ptr(y) + x;
+
+            // Element distance between consecutive disparity planes.
+            const size_t msg_disp_step = elem_step * rows;
+            const size_t data_disp_step = data.step * rows / sizeof(T);
+
+            // Incoming messages from the four neighbours.
+            const T* upFromBelow   = up + elem_step;
+            const T* downFromAbove = down - elem_step;
+            const T* leftFromRight = left + 1;
+            const T* rightFromLeft = right - 1;
+
+            // The call order matters: each call overwrites one of this pixel's messages.
+            message(upFromBelow, leftFromRight, rightFromLeft, cost, up,    msg_disp_step, data_disp_step);
+            message(downFromAbove, leftFromRight, rightFromLeft, cost, down,  msg_disp_step, data_disp_step);
+            message(upFromBelow, downFromAbove, rightFromLeft, cost, right, msg_disp_step, data_disp_step);
+            message(upFromBelow, downFromAbove, leftFromRight, cost, left,  msg_disp_step, data_disp_step);
+        }
+
+        // Runs `iters` message-passing passes by launching one_iteration<T> once per pass.
+        //
+        // cols, rows  - size of the current level, forwarded to the kernel
+        // iters       - number of passes; pass t is launched with parity argument t
+        // u, d, l, r  - message buffers, reinterpreted as T*; the row pitch of `u` (in
+        //               elements) is used for all four
+        // data        - data-cost volume, reinterpreted as PtrStepSz<T>
+        // stream      - stream the kernels are issued on
+        //
+        // Each thread covers two columns, hence the grid width of divUp(cols, 2 * 32).
+        template <typename T>
+        void calc_all_iterations_gpu(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d,
+            const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream)
+        {
+            dim3 threads(32, 8, 1);
+            dim3 grid(1, 1, 1);
+
+            grid.x = divUp(cols, threads.x << 1);
+            grid.y = divUp(rows, threads.y);
+
+            int elem_step = (int)(u.step / sizeof(T));
+
+            for(int t = 0; t < iters; ++t)
+            {
+                one_iteration<T><<<grid, threads, 0, stream>>>(t, elem_step, (T*)u.data, (T*)d.data, (T*)l.data, (T*)r.data, (PtrStepSz<T>)data, cols, rows);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            // Kernels issued to the same stream run in launch order, so the passes are already
+            // serialised on the device. A single host-side wait after the last launch is enough
+            // for the default stream and avoids stalling the host once per pass.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Explicit instantiations for the two supported element types.
+        template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
+        template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
+
+        ///////////////////////////////////////////////////////////////
+        /////////////////////////// output ////////////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        // Picks the final disparity for every interior pixel, one thread per pixel.
+        //
+        // For each of the cndisp planes the kernel adds the four incoming messages (u from the
+        // row below, d from the row above, l from the right column, r from the left column)
+        // and the data cost of the pixel, and writes the index of the plane with the smallest
+        // sum to disp(x, y); ties keep the lowest index. Border pixels are not written.
+        // elem_step is used as the row pitch (in elements) for all five input buffers.
+        template <typename T>
+        __global__ void output(const int elem_step, const T* u, const T* d, const T* l, const T* r, const T* data,
+            PtrStepSz<short> disp)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            const bool interior = (y > 0) && (y < disp.rows - 1) && (x > 0) && (x < disp.cols - 1);
+            if (!interior)
+                return;
+
+            const T* fromBelow = u + (y + 1) * elem_step + x;
+            const T* fromAbove = d + (y - 1) * elem_step + x;
+            const T* fromRight = l + y * elem_step + (x + 1);
+            const T* fromLeft  = r + y * elem_step+ (x - 1);
+            const T* cost      = data + y * elem_step + x;
+
+            // Element distance between consecutive disparity planes.
+            const size_t planeStride = disp.rows * elem_step;
+
+            int bestPlane = 0;
+            float bestSum = numeric_limits<float>::max();
+
+            for (int plane = 0; plane < cndisp; ++plane)
+            {
+                const size_t off = plane * planeStride;
+
+                float sum = fromBelow[off];
+                sum += fromAbove[off];
+                sum += fromRight[off];
+                sum += fromLeft[off];
+                sum += cost[off];
+
+                if (sum < bestSum)
+                {
+                    bestSum = sum;
+                    bestPlane = plane;
+                }
+            }
+
+            disp.ptr(y)[x] = saturate_cast<short>(bestPlane);
+        }
+
+        // Host launcher for output<T>: 32x8 thread blocks covering disp.cols x disp.rows,
+        // issued on `stream`. The five input buffers are passed as raw T* pointers together
+        // with the row pitch of `u` in elements. Waits for the device when `stream` is the
+        // default stream (0).
+        template <typename T>
+        void output_gpu(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data,
+            const PtrStepSz<short>& disp, cudaStream_t stream)
+        {
+            const dim3 threads(32, 8, 1);
+            const dim3 grid(divUp(disp.cols, threads.x), divUp(disp.rows, threads.y), 1);
+
+            const int elem_step = static_cast<int>(u.step/sizeof(T));
+
+            const T* uPtr    = (const T*)u.data;
+            const T* dPtr    = (const T*)d.data;
+            const T* lPtr    = (const T*)l.data;
+            const T* rPtr    = (const T*)r.data;
+            const T* dataPtr = (const T*)data.data;
+
+            output<T><<<grid, threads, 0, stream>>>(elem_step, uPtr, dPtr, lPtr, rPtr, dataPtr, disp);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Explicit instantiations for the two supported element types.
+        template void output_gpu<short>(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
+        template void output_gpu<float>(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
+    } // namespace stereobp
+}}} // namespace cv { namespace cuda { namespace device
+
+#endif /* CUDA_DISABLER */
--- a/modules/cudastereo/src/cuda/stereocsbp.cu
+++ b/modules/cudastereo/src/cuda/stereocsbp.cu
@@ -0,0 +1,864 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/core/cuda/limits.hpp"
+#include "opencv2/core/cuda/reduce.hpp"
+#include "opencv2/core/cuda/functional.hpp"
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace stereocsbp
+    {
+        ///////////////////////////////////////////////////////////////
+        /////////////////////// load constants ////////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        // Parameters and buffer descriptions held in constant memory for the kernels of this
+        // namespace.
+        __constant__ int cndisp;              // number of disparity planes scanned by the kernels
+
+        __constant__ float cmax_data_term;    // cap on the data cost (applied together with cdata_weight)
+        __constant__ float cdata_weight;      // multiplier applied to the data cost
+        __constant__ float cmax_disc_term;
+        __constant__ float cdisc_single_jump;
+
+        __constant__ int cth;                 // receives min_disp_th
+
+        // Step values. Only cimg_step is written by load_constants() below.
+        __constant__ size_t cimg_step;
+        __constant__ size_t cmsg_step;
+        __constant__ size_t cdisp_step1;
+        __constant__ size_t cdisp_step2;
+
+        // Device pointers to the left image, the right image and the temporary buffer.
+        __constant__ uchar* cleft;
+        __constant__ uchar* cright;
+        __constant__ uchar* ctemp;
+
+
+        // Uploads the scalar parameters, the step of `left` and the data pointers of `left`,
+        // `right` and `temp` to the constant-memory symbols above. Each copy is checked with
+        // cudaSafeCall. cmsg_step, cdisp_step1 and cdisp_step2 are not touched here.
+        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
+                            const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& temp)
+        {
+            // Scalar algorithm parameters.
+            cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(ndisp)) );
+
+            cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(max_data_term)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(data_weight)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(max_disc_term)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(disc_single_jump)) );
+
+            cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(min_disp_th)) );
+
+            // Image step and buffer addresses.
+            cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) );
+
+            cudaSafeCall( cudaMemcpyToSymbol(cleft, &left.data, sizeof(left.data)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) );
+            cudaSafeCall( cudaMemcpyToSymbol(ctemp, &temp.data, sizeof(temp.data)) );
+        }
+
+        ///////////////////////////////////////////////////////////////
+        /////////////////////// init data cost ////////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        // DataCostPerPixel<channels>::compute returns the weighted, capped matching cost
+        // between one left pixel and one right pixel with the given number of channels.
+        template <int channels> struct DataCostPerPixel;
+
+        // Single-channel pixels: cdata_weight * |left - right|, capped at
+        // cdata_weight * cmax_data_term.
+        template <> struct DataCostPerPixel<1>
+        {
+            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
+            {
+                const int diff = ::abs((int)*left - *right);
+                const float cap = cdata_weight * cmax_data_term;
+
+                return fmin(cdata_weight * diff, cap);
+            }
+        };
+        // Three-channel pixels: per-channel absolute differences weighted 0.114 / 0.587 / 0.299
+        // for channels 0 / 1 / 2, then scaled by cdata_weight and capped at
+        // cdata_weight * cmax_data_term.
+        template <> struct DataCostPerPixel<3>
+        {
+            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
+            {
+                const int diff0 = ::abs((int)left[0] - right[0]);
+                const int diff1 = ::abs((int)left[1] - right[1]);
+                const int diff2 = ::abs((int)left[2] - right[2]);
+
+                const float partB = 0.114f * diff0;
+                const float partG = 0.587f * diff1;
+                const float partR = 0.299f * diff2;
+
+                const float cap = cdata_weight * cmax_data_term;
+
+                return fmin(cdata_weight * (partR + partG + partB), cap);
+            }
+        };
+        // Four-channel pixels: same cost as the three-channel case, computed on the first
+        // three components; the fourth component (w) is not used. Both pixels are loaded
+        // with a single uchar4 read each.
+        template <> struct DataCostPerPixel<4>
+        {
+            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
+            {
+                const uchar4 lp = *((const uchar4*)left);
+                const uchar4 rp = *((const uchar4*)right);
+
+                const int diff0 = ::abs((int)lp.x - rp.x);
+                const int diff1 = ::abs((int)lp.y - rp.y);
+                const int diff2 = ::abs((int)lp.z - rp.z);
+
+                const float partB = 0.114f * diff0;
+                const float partG = 0.587f * diff1;
+                const float partR = 0.299f * diff2;
+
+                const float cap = cdata_weight * cmax_data_term;
+
+                return fmin(cdata_weight * (partR + partG + partB), cap);
+            }
+        };
+
+        // Selects, for every pixel, the nr_plane disparities with the smallest data cost.
+        //
+        // One thread per pixel (x < w, y < h). The costs are read from the temporary buffer
+        // `ctemp`. In round i the thread scans all cndisp planes, stores the smallest cost in
+        // plane i of data_cost_selected_ and its disparity in plane i of selected_disp_pyr,
+        // and then overwrites that cost in `ctemp` with the maximum value of T so the next
+        // round skips it. Ties go to the lowest disparity.
+        // Rows are cmsg_step elements apart and planes cdisp_step1 elements apart.
+        template <typename T>
+        __global__ void get_first_k_initial_global(T* data_cost_selected_, T *selected_disp_pyr, int h, int w, int nr_plane)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (y >= h || x >= w)
+                return;
+
+            const size_t pixel = y * cmsg_step + x;
+
+            T* bestDisp = selected_disp_pyr + pixel;
+            T* bestCost = data_cost_selected_ + pixel;
+            T* costs    = (T*)ctemp + pixel;
+
+            for (int plane = 0; plane < nr_plane; plane++)
+            {
+                T lowest = numeric_limits<T>::max();
+                int lowestAt = 0;
+
+                for (int d = 0; d < cndisp; d++)
+                {
+                    const T candidate = costs[d * cdisp_step1];
+                    if (candidate < lowest)
+                    {
+                        lowest = candidate;
+                        lowestAt = d;
+                    }
+                }
+
+                bestCost[plane * cdisp_step1] = lowest;
+                bestDisp[plane * cdisp_step1] = lowestAt;
+
+                // Knock the winner out so the following rounds pick the next smallest cost.
+                costs[lowestAt * cdisp_step1] = numeric_limits<T>::max();
+            }
+        }
+
+
+        // Selects nr_plane candidate disparities per pixel, preferring local minima of the
+        // data cost.
+        //
+        // One thread per pixel (x < w, y < h). The costs are read from the temporary buffer
+        // `ctemp`; rows are cmsg_step elements apart and planes cdisp_step1 elements apart.
+        //
+        // Phase 1 walks disparities 1 .. cndisp - 2 in increasing order and records every
+        // strict local minimum (cost smaller than both neighbours) until nr_plane candidates
+        // have been found. Phase 2 fills any remaining slots with the smallest costs not yet
+        // taken. Every selected cost is overwritten in `ctemp` with the maximum value of T so
+        // it cannot be chosen twice.
+        //
+        // Results go to data_cost_selected_ (costs) and selected_disp_pyr (disparities).
+        template <typename T>
+        __global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)
+        {
+            int x = blockIdx.x * blockDim.x + threadIdx.x;
+            int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (y < h && x < w)
+            {
+                T* selected_disparity = selected_disp_pyr + y * cmsg_step + x;
+                T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
+                T* data_cost = (T*)ctemp + y * cmsg_step + x;
+
+                int nr_local_minimum = 0;
+
+                // A local minimum needs a neighbour on each side, so phase 1 only applies when
+                // there are at least three planes; this also keeps the initial loads in range.
+                if (cndisp >= 3)
+                {
+                    // Sliding window over the costs at d - 1, d and d + 1. The values are kept
+                    // in registers, so overwriting a selected cost below does not affect the
+                    // comparisons of later iterations.
+                    T prev = data_cost[0 * cdisp_step1];
+                    T cur  = data_cost[1 * cdisp_step1];
+                    T next = data_cost[2 * cdisp_step1];
+
+                    for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)
+                    {
+                        if (cur < prev && cur < next)
+                        {
+                            data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
+                            selected_disparity[nr_local_minimum * cdisp_step1] = d;
+
+                            data_cost[d * cdisp_step1] = numeric_limits<T>::max();
+
+                            nr_local_minimum++;
+                        }
+
+                        // Advance the window to d + 1: its right neighbour is the cost at d + 2.
+                        // (Loading index d + 1 here would duplicate the value just moved into
+                        // `cur` and leave the window one step behind the loop counter.)
+                        prev = cur;
+                        cur = next;
+                        if (d + 2 < cndisp)
+                            next = data_cost[(d + 2) * cdisp_step1];
+                    }
+                }
+
+                // Phase 2: fill the remaining slots with the smallest costs still available.
+                for (int i = nr_local_minimum; i < nr_plane; i++)
+                {
+                    T minimum = numeric_limits<T>::max();
+                    int id = 0;
+
+                    for (int d = 0; d < cndisp; d++)
+                    {
+                        const T cost = data_cost[d * cdisp_step1];
+                        if (cost < minimum)
+                        {
+                            minimum = cost;
+                            id = d;
+                        }
+                    }
+                    data_cost_selected[i * cdisp_step1] = minimum;
+                    selected_disparity[i * cdisp_step1] = id;
+
+                    data_cost[id * cdisp_step1] = numeric_limits<T>::max();
+                }
+            }
+        }
+
+        // Fills the full cost volume in ctemp for one pyramid level.
+        //
+        // Each thread owns one coarse pixel (x, y) and, for every disparity d in [0, cndisp), sums the
+        // per-pixel matching cost over the (1 << level) x (1 << level) window of full-resolution pixels
+        // that the coarse pixel covers. Window pixels whose match would fall left of column 0, and all
+        // disparities below cth, contribute the constant cdata_weight * cmax_data_term instead.
+        // The result for disparity d is stored at ctemp[d * cdisp_step1 + y * cmsg_step + x].
+        template <typename T, int channels>
+        __global__ void init_data_cost(int h, int w, int level)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (y >= h || x >= w)
+                return;
+
+            // Full-resolution window covered by this coarse pixel.
+            const int row_begin = y << level;
+            const int row_end   = (y + 1) << level;
+            const int col_begin = x << level;
+            const int col_end   = (x + 1) << level;
+
+            T* cost_out = (T*)ctemp + y * cmsg_step + x;
+
+            for (int d = 0; d < cndisp; ++d)
+            {
+                float acc = 0.0f;
+
+                for (int row = row_begin; row < row_end; row++)
+                {
+                    for (int col = col_begin; col < col_end; col++)
+                    {
+                        const int col_right = col - d;
+
+                        if (d >= cth && col_right >= 0)
+                        {
+                            const uchar* left_px  = cleft + row * cimg_step + col * channels;
+                            const uchar* right_px = cright + row * cimg_step + col_right * channels;
+
+                            acc += DataCostPerPixel<channels>::compute(left_px, right_px);
+                        }
+                        else
+                        {
+                            // No usable match for this pixel: charge the fixed penalty.
+                            acc += cdata_weight * cmax_data_term;
+                        }
+                    }
+                }
+
+                cost_out[cdisp_step1 * d] = saturate_cast<T>(acc);
+            }
+        }
+
+        // Cooperative variant of init_data_cost for levels where one coarse pixel covers a winsz-wide window.
+        //
+        // Launch layout (see init_data_cost_reduce_caller_): blockDim = (winsz, 1, planes), gridDim.x = w,
+        // gridDim.y = h * ceil(ndisp / planes), dynamic shared memory of blockDim.x * blockDim.z floats.
+        // blockIdx.x / blockIdx.y % h select the coarse pixel, blockIdx.y / h together with threadIdx.z
+        // selects the disparity d, and threadIdx.x selects one column of the window. Each thread sums its
+        // column over up to winsz rows (clipped to `rows`); the winsz partial sums are then combined with
+        // reduce<winsz> and lane 0 writes the total to ctemp.
+        //
+        // The reduction is executed by every thread of the block, including those whose d is past
+        // cndisp (they contribute 0 to their own private slice of shared memory and never store a result).
+        // NOTE(review): reduce<> is defined elsewhere; if it contains block-level barriers, calling it
+        // from only part of the block would be undefined behaviour, which is why it is kept unconditional.
+        template <typename T, int winsz, int channels>
+        __global__ void init_data_cost_reduce(int level, int rows, int cols, int h)
+        {
+            int x_out = blockIdx.x;
+            int y_out = blockIdx.y % h;
+            int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;
+
+            int tid = threadIdx.x;
+
+            // Threads with d beyond the disparity range only take part in the reduction.
+            const bool active = d < cndisp;
+
+            float val = 0.0f;
+            if (active)
+            {
+                int x0 = x_out << level;
+                int y0 = y_out << level;
+
+                // Number of window rows that are inside the image.
+                int len = ::min(y0 + winsz, rows) - y0;
+
+                if (x0 + tid < cols)
+                {
+                    if (x0 + tid - d < 0 || d < cth)
+                        val = cdata_weight * cmax_data_term * len;
+                    else
+                    {
+                        const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
+                        const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);
+
+                        for(int y = 0; y < len; ++y)
+                        {
+                            val += DataCostPerPixel<channels>::compute(lle, lri);
+
+                            lle += cimg_step;
+                            lri += cimg_step;
+                        }
+                    }
+                }
+            }
+
+            extern __shared__ float smem[];
+
+            // Each z-slice of the block reduces inside its own winsz-float region.
+            reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
+
+            if (active && tid == 0)
+            {
+                T* data_cost = (T*)ctemp + y_out * cmsg_step + x_out;
+                data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
+            }
+        }
+
+
+        // Launches init_data_cost with one thread per coarse pixel (32x8 blocks covering w x h),
+        // dispatching on the number of image channels. rows, cols and ndisp are unused here; they are
+        // only present so that the signature matches init_data_cost_reduce_caller_.
+        // Raises cv::Error::BadNumChannels for channel counts other than 1, 3 and 4.
+        template <typename T>
+        void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)
+        {
+            const dim3 block(32, 8, 1);
+
+            dim3 blocks(1, 1, 1);
+            blocks.x = divUp(w, block.x);
+            blocks.y = divUp(h, block.y);
+
+            if (channels == 1)
+            {
+                init_data_cost<T, 1><<<blocks, block, 0, stream>>>(h, w, level);
+            }
+            else if (channels == 3)
+            {
+                init_data_cost<T, 3><<<blocks, block, 0, stream>>>(h, w, level);
+            }
+            else if (channels == 4)
+            {
+                init_data_cost<T, 4><<<blocks, block, 0, stream>>>(h, w, level);
+            }
+            else
+            {
+                CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
+            }
+        }
+
+        // Launches init_data_cost_reduce for a window width of winsz.
+        //
+        // A block has 256 threads arranged as (winsz, 1, 256 / winsz): x walks the window columns and z
+        // enumerates the disparities handled by the block. The grid has one block column per coarse
+        // pixel column and h * ceil(ndisp / blockDim.z) block rows. One float of dynamic shared memory is
+        // reserved per thread. Raises cv::Error::BadNumChannels for channel counts other than 1, 3 and 4.
+        template <typename T, int winsz>
+        void init_data_cost_reduce_caller_(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream)
+        {
+            const int block_threads = 256;
+            const size_t shared_bytes = block_threads * sizeof(float);
+
+            const dim3 block(winsz, 1, block_threads / winsz);
+
+            dim3 blocks(w, h, 1);
+            blocks.y *= divUp(ndisp, block.z);
+
+            if (channels == 1)
+            {
+                init_data_cost_reduce<T, winsz, 1><<<blocks, block, shared_bytes, stream>>>(level, rows, cols, h);
+            }
+            else if (channels == 3)
+            {
+                init_data_cost_reduce<T, winsz, 3><<<blocks, block, shared_bytes, stream>>>(level, rows, cols, h);
+            }
+            else if (channels == 4)
+            {
+                init_data_cost_reduce<T, winsz, 4><<<blocks, block, shared_bytes, stream>>>(level, rows, cols, h);
+            }
+            else
+            {
+                CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
+            }
+        }
+
+        // Host entry point: builds the full data cost volume for pyramid level `level` and then selects
+        // the nr_plane best disparities per pixel.
+        //
+        //   rows, cols          - size passed through to the reduce kernels for clipping the window
+        //   disp_selected_pyr   - output: selected disparities, nr_plane planes
+        //   data_cost_selected  - output: costs of the selected disparities, nr_plane planes
+        //   msg_step            - row stride, in elements, of the per-level buffers
+        //   h, w                - size of the current level
+        //   level               - pyramid level; selects the kernel flavour (0 and 1 use the per-pixel
+        //                         kernel, 2..8 use the reduce kernel with a 1 << level wide window)
+        //   use_local_init_data_cost - prefer local minima (get_first_k_initial_local) over the
+        //                         global k smallest costs (get_first_k_initial_global)
+        //   stream              - stream for the kernels; the default stream is synchronised after each launch
+        //
+        // Raises cv::Error::StsOutOfRange when `level` has no matching kernel.
+        // NOTE(review): the step constants are uploaded with cudaMemcpyToSymbol and shared by all
+        // launches; concurrent use from several streams may need extra care - confirm with callers.
+        template<class T>
+        void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
+                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
+        {
+
+            typedef void (*InitDataCostCaller)(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream);
+
+            // Indexed by pyramid level.
+            static const InitDataCostCaller init_data_cost_callers[] =
+            {
+                init_data_cost_caller_<T>, init_data_cost_caller_<T>, init_data_cost_reduce_caller_<T, 4>,
+                init_data_cost_reduce_caller_<T, 8>, init_data_cost_reduce_caller_<T, 16>, init_data_cost_reduce_caller_<T, 32>,
+                init_data_cost_reduce_caller_<T, 64>, init_data_cost_reduce_caller_<T, 128>, init_data_cost_reduce_caller_<T, 256>
+            };
+
+            // Reject levels outside the dispatch table instead of calling through a garbage pointer.
+            const int callers_count = static_cast<int>(sizeof(init_data_cost_callers) / sizeof(init_data_cost_callers[0]));
+            if (level < 0 || level >= callers_count)
+                CV_Error(cv::Error::StsOutOfRange, "Unsupported pyramid level");
+
+            size_t disp_step = msg_step * h;
+            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,  &msg_step,  sizeof(size_t)) );
+
+            init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+
+            // Second pass: one thread per pixel picks the nr_plane candidates.
+            dim3 threads(32, 8, 1);
+            dim3 grid(1, 1, 1);
+
+            grid.x = divUp(w, threads.x);
+            grid.y = divUp(h, threads.y);
+
+            if (use_local_init_data_cost == true)
+                get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
+            else
+                get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);
+
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Explicit instantiations of the host-side init_data_cost for short and float buffers.
+        template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
+                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
+
+        template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,
+                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
+
+        ///////////////////////////////////////////////////////////////
+        ////////////////////// compute data cost //////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        // Computes the data cost of pixel (x, y) at the current level for each of the nr_plane
+        // disparities that were selected for its parent pixel (x / 2, y / 2) on the coarser level.
+        //
+        // For plane d the cost is summed over the (1 << level) x (1 << level) window of full-resolution
+        // pixels covered by (x, y). Pixels whose match would fall left of column 0, and disparities
+        // below cth, contribute cdata_weight * cmax_data_term. Parent planes are cdisp_step2 apart,
+        // output planes are cdisp_step1 apart. One thread per pixel; threads outside h x w do nothing.
+        template <typename T, int channels>
+        __global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)
+        {
+            int x = blockIdx.x * blockDim.x + threadIdx.x;
+            int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (y < h && x < w)
+            {
+                int y0 = y << level;
+                int yt = (y + 1) << level;
+
+                int x0 = x << level;
+                int xt = (x + 1) << level;
+
+                const T* selected_disparity = selected_disp_pyr + y/2 * cmsg_step + x/2;
+                T* data_cost = data_cost_ + y * cmsg_step + x;
+
+                for(int d = 0; d < nr_plane; d++)
+                {
+                    // The candidate disparity depends only on the plane, so fetch it once per plane
+                    // instead of once per window pixel.
+                    const int sel_disp = selected_disparity[d * cdisp_step2];
+                    const bool below_threshold = sel_disp < cth;
+
+                    float val = 0.0f;
+                    for(int yi = y0; yi < yt; yi++)
+                    {
+                        for(int xi = x0; xi < xt; xi++)
+                        {
+                            int xr = xi - sel_disp;
+
+                            if (xr < 0 || below_threshold)
+                                val += cdata_weight * cmax_data_term;
+                            else
+                            {
+                                const uchar* left_x = cleft + yi * cimg_step + xi * channels;
+                                const uchar* right_x = cright + yi * cimg_step + xr * channels;
+
+                                val += DataCostPerPixel<channels>::compute(left_x, right_x);
+                            }
+                        }
+                    }
+                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
+                }
+            }
+        }
+
+        // Cooperative variant of compute_data_cost for levels where one pixel covers a winsz-wide window.
+        //
+        // Launch layout (see compute_data_cost_reduce_caller_): blockDim = (winsz, 1, planes),
+        // gridDim.x = w, gridDim.y = h * ceil(nr_plane / planes), dynamic shared memory of
+        // blockDim.x * blockDim.z floats. blockIdx selects the output pixel and, together with
+        // threadIdx.z, the plane d; threadIdx.x selects one window column. The disparity tested for
+        // plane d is the one stored for the parent pixel (x_out / 2, y_out / 2). Column sums over up to
+        // winsz rows (clipped to `rows`) are combined with reduce<winsz>; lane 0 stores the total.
+        //
+        // The reduction is executed by every thread of the block, including those whose d is past
+        // nr_plane (they contribute 0 to their own private slice of shared memory and never store a result).
+        // NOTE(review): reduce<> is defined elsewhere; if it contains block-level barriers, calling it
+        // from only part of the block would be undefined behaviour, which is why it is kept unconditional.
+        template <typename T, int winsz, int channels>
+        __global__ void compute_data_cost_reduce(const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane)
+        {
+            int x_out = blockIdx.x;
+            int y_out = blockIdx.y % h;
+            int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;
+
+            int tid = threadIdx.x;
+
+            const T* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step + x_out/2;
+            T* data_cost = data_cost_ + y_out * cmsg_step + x_out;
+
+            // Threads with d beyond the plane count only take part in the reduction.
+            const bool active = d < nr_plane;
+
+            float val = 0.0f;
+            if (active)
+            {
+                int sel_disp = selected_disparity[d * cdisp_step2];
+
+                int x0 = x_out << level;
+                int y0 = y_out << level;
+
+                // Number of window rows that are inside the image.
+                int len = ::min(y0 + winsz, rows) - y0;
+
+                if (x0 + tid < cols)
+                {
+                    if (x0 + tid - sel_disp < 0 || sel_disp < cth)
+                        val = cdata_weight * cmax_data_term * len;
+                    else
+                    {
+                        const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
+                        const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);
+
+                        for(int y = 0; y < len; ++y)
+                        {
+                            val += DataCostPerPixel<channels>::compute(lle, lri);
+
+                            lle += cimg_step;
+                            lri += cimg_step;
+                        }
+                    }
+                }
+            }
+
+            extern __shared__ float smem[];
+
+            // Each z-slice of the block reduces inside its own winsz-float region.
+            reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
+
+            if (active && tid == 0)
+                data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
+        }
+
+        // Launches compute_data_cost with one thread per pixel (32x8 blocks covering w x h),
+        // dispatching on the number of image channels. rows and cols are unused here; they are only
+        // present so that the signature matches compute_data_cost_reduce_caller_.
+        // Raises cv::Error::BadNumChannels for channel counts other than 1, 3 and 4.
+        template <typename T>
+        void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,
+                                      int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
+        {
+            const dim3 block(32, 8, 1);
+
+            dim3 blocks(1, 1, 1);
+            blocks.x = divUp(w, block.x);
+            blocks.y = divUp(h, block.y);
+
+            if (channels == 1)
+            {
+                compute_data_cost<T, 1><<<blocks, block, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane);
+            }
+            else if (channels == 3)
+            {
+                compute_data_cost<T, 3><<<blocks, block, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane);
+            }
+            else if (channels == 4)
+            {
+                compute_data_cost<T, 4><<<blocks, block, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane);
+            }
+            else
+            {
+                CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
+            }
+        }
+
+        // Launches compute_data_cost_reduce for a window width of winsz.
+        //
+        // A block has 256 threads arranged as (winsz, 1, 256 / winsz): x walks the window columns and z
+        // enumerates the planes handled by the block. The grid has one block column per pixel column
+        // and h * ceil(nr_plane / blockDim.z) block rows. One float of dynamic shared memory is reserved
+        // per thread. Raises cv::Error::BadNumChannels for channel counts other than 1, 3 and 4.
+        template <typename T, int winsz>
+        void compute_data_cost_reduce_caller_(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
+                                      int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
+        {
+            const int block_threads = 256;
+            const size_t shared_bytes = block_threads * sizeof(float);
+
+            const dim3 block(winsz, 1, block_threads / winsz);
+
+            dim3 blocks(w, h, 1);
+            blocks.y *= divUp(nr_plane, block.z);
+
+            if (channels == 1)
+            {
+                compute_data_cost_reduce<T, winsz, 1><<<blocks, block, shared_bytes, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane);
+            }
+            else if (channels == 3)
+            {
+                compute_data_cost_reduce<T, winsz, 3><<<blocks, block, shared_bytes, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane);
+            }
+            else if (channels == 4)
+            {
+                compute_data_cost_reduce<T, winsz, 4><<<blocks, block, shared_bytes, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane);
+            }
+            else
+            {
+                CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
+            }
+        }
+
+        // Host entry point: computes, for every pixel of the current level, the data cost of the
+        // nr_plane disparities selected on the coarser level.
+        //
+        //   disp_selected_pyr - disparities selected on the coarser level (planes msg_step * h2 apart)
+        //   data_cost         - output costs for the current level (planes msg_step * h apart)
+        //   msg_step          - row stride, in elements, of both buffers
+        //   rows, cols        - size passed through to the reduce kernels for clipping the window
+        //   h, w              - size of the current level; h2 - height of the coarser level
+        //   level             - pyramid level; selects the kernel flavour (0 and 1 use the per-pixel
+        //                       kernel, 2..8 use the reduce kernel with a 1 << level wide window)
+        //   stream            - stream for the kernel; the default stream is synchronised afterwards
+        //
+        // Raises cv::Error::StsOutOfRange when `level` has no matching kernel.
+        template<class T>
+        void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step,
+                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)
+        {
+            typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
+                int h, int w, int level, int nr_plane, int channels, cudaStream_t stream);
+
+            // Indexed by pyramid level.
+            static const ComputeDataCostCaller callers[] =
+            {
+                compute_data_cost_caller_<T>, compute_data_cost_caller_<T>, compute_data_cost_reduce_caller_<T, 4>,
+                compute_data_cost_reduce_caller_<T, 8>, compute_data_cost_reduce_caller_<T, 16>, compute_data_cost_reduce_caller_<T, 32>,
+                compute_data_cost_reduce_caller_<T, 64>, compute_data_cost_reduce_caller_<T, 128>, compute_data_cost_reduce_caller_<T, 256>
+            };
+
+            // Reject levels outside the dispatch table instead of calling through a garbage pointer.
+            const int callers_count = static_cast<int>(sizeof(callers) / sizeof(callers[0]));
+            if (level < 0 || level >= callers_count)
+                CV_Error(cv::Error::StsOutOfRange, "Unsupported pyramid level");
+
+            size_t disp_step1 = msg_step * h;
+            size_t disp_step2 = msg_step * h2;
+            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,  &msg_step,  sizeof(size_t)) );
+
+            callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Explicit instantiations of the host-side compute_data_cost for short and float buffers.
+        template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step,
+                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
+
+        template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step,
+                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
+
+
+        ///////////////////////////////////////////////////////////////
+        //////////////////////// init message /////////////////////////
+        ///////////////////////////////////////////////////////////////
+
+
+         // Keeps the nr_plane best of the nr_plane2 candidates inherited from the coarser level.
+        //
+        // data_cost_new holds the ranking score of every candidate (planes cdisp_step1 apart). For each
+        // output slot the candidate with the smallest score is located, its data cost, disparity and the
+        // four messages are copied into the slot, and its score is overwritten with
+        // numeric_limits<T>::max() so the next slot picks a different candidate. Arrays of the current
+        // level use plane stride cdisp_step1, arrays of the coarser level (disparity and messages) use
+        // cdisp_step2. Note that the stored cost is data_cost_cur[...], not the ranking score.
+        template <typename T>
+        __device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,
+                                                     const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
+                                                     T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,
+                                                     const T* data_cost_cur, const T* disparity_selected_cur,
+                                                     int nr_plane, int nr_plane2)
+        {
+            for (int slot = 0; slot < nr_plane; ++slot)
+            {
+                // Linear search for the cheapest candidate that is still available.
+                T best_score = numeric_limits<T>::max();
+                int best = 0;
+
+                for (int cand = 0; cand < nr_plane2; ++cand)
+                {
+                    const T score = data_cost_new[cand * cdisp_step1];
+                    if (score < best_score)
+                    {
+                        best_score = score;
+                        best = cand;
+                    }
+                }
+
+                data_cost_selected[slot * cdisp_step1] = data_cost_cur[best * cdisp_step1];
+                disparity_selected_new[slot * cdisp_step1] = disparity_selected_cur[best * cdisp_step2];
+
+                u_new[slot * cdisp_step1] = u_cur[best * cdisp_step2];
+                d_new[slot * cdisp_step1] = d_cur[best * cdisp_step2];
+                l_new[slot * cdisp_step1] = l_cur[best * cdisp_step2];
+                r_new[slot * cdisp_step1] = r_cur[best * cdisp_step2];
+
+                // Retire the winner so that it is not chosen again.
+                data_cost_new[best * cdisp_step1] = numeric_limits<T>::max();
+            }
+        }
+
+        // Initialises the messages, selected disparities and selected data costs of the current
+        // (h x w, nr_plane planes) level from the coarser (h2 x w2, nr_plane2 planes) level.
+        //
+        // For pixel (x, y) with parent (x / 2, y / 2), each of the nr_plane2 parent candidates is scored
+        // as data cost + the four messages read from the parent's neighbours (clamped to the coarse
+        // image). The scores go to ctemp, and get_first_k_element_increase then copies the nr_plane
+        // best candidates, taking the messages from the parent pixel itself.
+        // One thread per pixel; threads outside h x w do nothing.
+        template <typename T>
+        __global__ void init_message(T* u_new_, T* d_new_, T* l_new_, T* r_new_,
+                                     const T* u_cur_, const T* d_cur_, const T* l_cur_, const T* r_cur_,
+                                     T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
+                                     T* data_cost_selected_, const T* data_cost_,
+                                     int h, int w, int nr_plane, int h2, int w2, int nr_plane2)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (y >= h || x >= w)
+                return;
+
+            // Parent pixel on the coarser level.
+            const int xc = x / 2;
+            const int yc = y / 2;
+
+            // Messages arriving at the parent from its four neighbours (clamped at the borders).
+            const T* u_nb = u_cur_ + ::min(h2 - 1, yc + 1) * cmsg_step + xc;
+            const T* d_nb = d_cur_ + ::max(0, yc - 1)      * cmsg_step + xc;
+            const T* l_nb = l_cur_ + yc                    * cmsg_step + ::min(w2 - 1, xc + 1);
+            const T* r_nb = r_cur_ + yc                    * cmsg_step + ::max(0, xc - 1);
+
+            T* score = (T*)ctemp + y * cmsg_step + x;
+
+            const T* parent_disp = selected_disp_pyr_cur + yc * cmsg_step + xc;
+            const T* cost = data_cost_ + y * cmsg_step + x;
+
+            for (int p = 0; p < nr_plane2; ++p)
+            {
+                const int coarse_off = p * cdisp_step2;
+
+                const T total = cost[p * cdisp_step1] + u_nb[coarse_off] + d_nb[coarse_off] + l_nb[coarse_off] + r_nb[coarse_off];
+                score[p * cdisp_step1] = total;
+            }
+
+            T* cost_sel_out = data_cost_selected_ + y * cmsg_step + x;
+            T* disp_sel_out = selected_disp_pyr_new + y * cmsg_step + x;
+
+            T* u_out = u_new_ + y * cmsg_step + x;
+            T* d_out = d_new_ + y * cmsg_step + x;
+            T* l_out = l_new_ + y * cmsg_step + x;
+            T* r_out = r_new_ + y * cmsg_step + x;
+
+            // The messages that get copied come from the parent pixel itself, not its neighbours.
+            const T* u_parent = u_cur_ + yc * cmsg_step + xc;
+            const T* d_parent = d_cur_ + yc * cmsg_step + xc;
+            const T* l_parent = l_cur_ + yc * cmsg_step + xc;
+            const T* r_parent = r_cur_ + yc * cmsg_step + xc;
+
+            get_first_k_element_increase(u_out, d_out, l_out, r_out, u_parent, d_parent, l_parent, r_parent,
+                                         cost_sel_out, disp_sel_out, score,
+                                         cost, parent_disp, nr_plane, nr_plane2);
+        }
+
+
+        // Host entry point for the init_message kernel.
+        //
+        // Uploads the plane strides of the current level (msg_step * h) and of the coarser level
+        // (msg_step * h2) plus the row stride to constant memory, then launches one thread per pixel of
+        // the current h x w level in 32x8 blocks on `stream`. The default stream is synchronised
+        // afterwards.
+        template<class T>
+        void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
+                          const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
+                          T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
+                          T* data_cost_selected, const T* data_cost, size_t msg_step,
+                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream)
+        {
+            const size_t plane_step_new = msg_step * h;
+            const size_t plane_step_cur = msg_step * h2;
+
+            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &plane_step_new, sizeof(size_t)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &plane_step_cur, sizeof(size_t)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,   &msg_step, sizeof(size_t)) );
+
+            const dim3 block(32, 8, 1);
+
+            dim3 blocks(1, 1, 1);
+            blocks.x = divUp(w, block.x);
+            blocks.y = divUp(h, block.y);
+
+            init_message<<<blocks, block, 0, stream>>>(u_new, d_new, l_new, r_new,
+                                                       u_cur, d_cur, l_cur, r_cur,
+                                                       selected_disp_pyr_new, selected_disp_pyr_cur,
+                                                       data_cost_selected, data_cost,
+                                                       h, w, nr_plane, h2, w2, nr_plane2);
+            cudaSafeCall( cudaGetLastError() );
+
+            // Only the default stream is waited for; callers with their own stream sync themselves.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+
+        // Explicit instantiations of the host-side init_message for short and float buffers.
+        template void init_message(short* u_new, short* d_new, short* l_new, short* r_new,
+                          const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,
+                          short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,
+                          short* data_cost_selected, const short* data_cost, size_t msg_step,
+                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
+
+        template void init_message(float* u_new, float* d_new, float* l_new, float* r_new,
+                          const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,
+                          float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,
+                          float* data_cost_selected, const float* data_cost, size_t msg_step,
+                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
+
+        ///////////////////////////////////////////////////////////////
+        ////////////////////  calc all iterations /////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        // Recomputes one outgoing message (nr_plane values, planes cdisp_step1 apart) of a pixel.
+        //
+        // Step 1: msg_dst[p] = data[p] + msg1[p] + msg2[p] + msg3[p]; the smallest sum is remembered.
+        // Step 2: for every plane p of the receiver (disparity src_disp[p]) the message value is the
+        //         minimum of (smallest sum + cmax_disc_term) and, over all sender planes q,
+        //         msg_dst[q] + cdisc_single_jump * |dst_disp[q] - src_disp[p]|. Results are staged in temp.
+        // Step 3: the mean of the staged values is subtracted and the result written back to msg_dst.
+        template <typename T>
+        __device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
+                                          const T* dst_disp, const T* src_disp, int nr_plane, volatile T* temp)
+        {
+            T lowest = numeric_limits<T>::max();
+
+            for (int p = 0; p < nr_plane; ++p)
+            {
+                const int off = p * cdisp_step1;
+                const T total = data[off] + msg1[off] + msg2[off] + msg3[off];
+
+                if (total < lowest)
+                    lowest = total;
+
+                msg_dst[off] = total;
+            }
+
+            float mean = 0;
+            for (int p = 0; p < nr_plane; ++p)
+            {
+                // Upper bound: the cost of jumping to an arbitrary disparity is capped.
+                float best = lowest + cmax_disc_term;
+                const T receiver_disp = src_disp[p * cdisp_step1];
+
+                for (int q = 0; q < nr_plane; ++q)
+                {
+                    best = fmin(best, msg_dst[q * cdisp_step1] + cdisc_single_jump * ::abs(dst_disp[q * cdisp_step1] - receiver_disp));
+                }
+
+                temp[p * cdisp_step1] = saturate_cast<T>(best);
+                mean += best;
+            }
+            mean /= nr_plane;
+
+            // Normalise the message so that its values are centred around zero.
+            for (int p = 0; p < nr_plane; ++p)
+            {
+                msg_dst[p * cdisp_step1] = saturate_cast<T>(temp[p * cdisp_step1] - mean);
+            }
+        }
+
+        // One half-sweep of message passing over a checkerboard pattern.
+        //
+        // Each thread maps to the pixel x = 2 * (global thread x) + ((y + i) & 1), so a launch touches
+        // every second pixel of a row and the parity `i` selects which half. Border pixels are skipped.
+        // For the chosen pixel the four messages u, d, l, r are recomputed in turn with
+        // message_per_pixel, each from the pixel's data cost and three messages read at or next to the
+        // pixel; ctemp provides the per-pixel scratch planes.
+        template <typename T>
+        __global__ void compute_message(T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i)
+        {
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + i) & 1);
+
+            // Only interior pixels are updated.
+            if (y <= 0 || y >= h - 1 || x <= 0 || x >= w - 1)
+                return;
+
+            const T* data = data_cost_selected + y * cmsg_step + x;
+            const T* disp = selected_disp_pyr_cur + y * cmsg_step + x;
+
+            T* u = u_ + y * cmsg_step + x;
+            T* d = d_ + y * cmsg_step + x;
+            T* l = l_ + y * cmsg_step + x;
+            T* r = r_ + y * cmsg_step + x;
+
+            T* scratch = (T*)ctemp + y * cmsg_step + x;
+
+            // Incoming messages stored at the neighbouring pixels.
+            const T* u_next_row = u + cmsg_step;
+            const T* d_prev_row = d - cmsg_step;
+            const T* l_next_col = l + 1;
+            const T* r_prev_col = r - 1;
+
+            message_per_pixel(data, u, r_prev_col, u_next_row, l_next_col, disp, disp - cmsg_step, nr_plane, scratch);
+            message_per_pixel(data, d, d_prev_row, r_prev_col, l_next_col, disp, disp + cmsg_step, nr_plane, scratch);
+            message_per_pixel(data, l, u_next_row, d_prev_row, l_next_col, disp, disp - 1, nr_plane, scratch);
+            message_per_pixel(data, r, u_next_row, d_prev_row, r_prev_col, disp, disp + 1, nr_plane, scratch);
+        }
+
+
+        // Runs `iters` message-passing half-sweeps on one pyramid level.
+        //
+        // Uploads the plane stride (msg_step * h) and the row stride to constant memory, then launches
+        // compute_message `iters` times on `stream`, alternating the checkerboard parity (t & 1).
+        // Blocks are 32x8 threads and every thread covers two columns, hence the grid width of
+        // ceil(w / 64). The default stream is synchronised once after the last launch.
+        template<class T>
+        void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
+            const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)
+        {
+            const size_t plane_step = msg_step * h;
+            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &plane_step, sizeof(size_t)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,  &msg_step,  sizeof(size_t)) );
+
+            const dim3 block(32, 8, 1);
+
+            dim3 blocks(1, 1, 1);
+            blocks.x = divUp(w, block.x << 1);
+            blocks.y = divUp(h, block.y);
+
+            int sweep = 0;
+            while (sweep < iters)
+            {
+                compute_message<<<blocks, block, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, sweep & 1);
+                cudaSafeCall( cudaGetLastError() );
+                ++sweep;
+            }
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Explicit instantiations of calc_all_iterations for short and float buffers.
+        template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,
+            int h, int w, int nr_plane, int iters, cudaStream_t stream);
+
+        template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step,
+            int h, int w, int nr_plane, int iters, cudaStream_t stream);
+
+
+        ///////////////////////////////////////////////////////////////
+        /////////////////////////// output ////////////////////////////
+        ///////////////////////////////////////////////////////////////
+
+
+        // Writes the final disparity of every interior pixel of `disp`.
+        //
+        // For pixel (x, y) the belief of plane i is the data cost plus the four messages read from the
+        // neighbouring pixels (u from the row below, d from the row above, l from the next column,
+        // r from the previous column). The disparity stored for the plane with the smallest belief is
+        // written to disp(y, x); if no plane beats numeric_limits<T>::max() the result is 0.
+        // Border pixels are left untouched. One thread per pixel.
+        template <typename T>
+        __global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,
+                                     const T* data_cost_selected, const T* disp_selected_pyr,
+                                     PtrStepSz<short> disp, int nr_plane)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (y <= 0 || y >= disp.rows - 1 || x <= 0 || x >= disp.cols - 1)
+                return;
+
+            const T* cost = data_cost_selected + y * cmsg_step + x;
+            const T* candidates = disp_selected_pyr + y * cmsg_step + x;
+
+            const T* from_below = u_ + (y+1) * cmsg_step + (x+0);
+            const T* from_above = d_ + (y-1) * cmsg_step + (x+0);
+            const T* from_right = l_ + (y+0) * cmsg_step + (x+1);
+            const T* from_left  = r_ + (y+0) * cmsg_step + (x-1);
+
+            int winner = 0;
+            T lowest = numeric_limits<T>::max();
+
+            for (int plane = 0; plane < nr_plane; ++plane)
+            {
+                const int off = plane * cdisp_step1;
+                const T belief = cost[off]+ from_below[off] + from_above[off] + from_right[off] + from_left[off];
+
+                if (belief < lowest)
+                {
+                    lowest = belief;
+                    winner = saturate_cast<short>(candidates[off]);
+                }
+            }
+
+            disp(y, x) = winner;
+        }
+
+        // Host entry point for the compute_disp kernel.
+        //
+        // Uploads the plane stride (disp.rows * msg_step) and the row stride to constant memory and
+        // launches one thread per pixel of `disp` in 32x8 blocks on `stream`. The default stream is
+        // synchronised afterwards.
+        template<class T>
+        void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
+            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream)
+        {
+            const size_t plane_step = disp.rows * msg_step;
+            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &plane_step, sizeof(size_t)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,  &msg_step,  sizeof(size_t)) );
+
+            const dim3 block(32, 8, 1);
+
+            dim3 blocks(1, 1, 1);
+            blocks.x = divUp(disp.cols, block.x);
+            blocks.y = divUp(disp.rows, block.y);
+
+            compute_disp<<<blocks, block, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected, disp, nr_plane);
+            cudaSafeCall( cudaGetLastError() );
+
+            // Only the default stream is waited for; callers with their own stream sync themselves.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Explicit instantiations of the host-side compute_disp for short and float buffers.
+        template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,
+            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
+
+        template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,
+            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
+    } // namespace stereocsbp
+}}} // namespace cv { namespace cuda { namespace cudev {
+
+#endif /* CUDA_DISABLER */
--- a/modules/cudastereo/src/cuda/util.cu
+++ b/modules/cudastereo/src/cuda/util.cu
@@ -0,0 +1,235 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/transform.hpp"
+#include "opencv2/core/cuda/functional.hpp"
+#include "opencv2/core/cuda/reduce.hpp"
+
+namespace cv { namespace cuda { namespace device
+{
+    /////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
+
+    // 16 coefficients read by reprojectImageTo3D; indexed below as a row-major 4x4 matrix.
+    __constant__ float cq[16];
+
+    // Kernel: for every pixel (x, y) of `disp` computes cq * [x, y, d, 1]^T, divides the first
+    // three components by the fourth and stores them in xyz(y, x).
+    // Any further component of D (e.g. w of float4) keeps the value 1 set by VecTraits<D>::all.
+    // Expects a 2D launch whose grid covers disp.cols x disp.rows; surplus threads exit early.
+    template <typename T, typename D>
+    __global__ void reprojectImageTo3D(const PtrStepSz<T> disp, PtrStep<D> xyz)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x < disp.cols && y < disp.rows)
+        {
+            // Disparity-independent part of each matrix row.
+            const float rowX = x * cq[ 0] + y * cq[ 1] + cq[ 3];
+            const float rowY = x * cq[ 4] + y * cq[ 5] + cq[ 7];
+            const float rowZ = x * cq[ 8] + y * cq[ 9] + cq[11];
+            const float rowW = x * cq[12] + y * cq[13] + cq[15];
+
+            const T dispVal = disp(y, x);
+
+            // Reciprocal of the homogeneous coordinate, shared by all three outputs.
+            const float invW = 1.f / (rowW + cq[14] * dispVal);
+
+            D point = VecTraits<D>::all(1.0f);
+            point.x = (rowX + cq[2] * dispVal) * invW;
+            point.y = (rowY + cq[6] * dispVal) * invW;
+            point.z = (rowZ + cq[10] * dispVal) * invW;
+
+            xyz(y, x) = point;
+        }
+    }
+
+    // Host launcher for reprojectImageTo3D<T, D>.
+    //
+    // disp   : disparity image, reinterpreted as PtrStepSz<T>.
+    // xyz    : output image, reinterpreted as PtrStepSz<D>.
+    // q      : host pointer to the 16 floats copied into the cq symbol.
+    // stream : launch stream; the default (0) stream triggers a device-wide synchronization.
+    template <typename T, typename D>
+    void reprojectImageTo3D_gpu(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );
+
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(disp.cols, block.x), divUp(disp.rows, block.y));
+
+        reprojectImageTo3D<T, D><<<grid, block, 0, stream>>>(static_cast<PtrStepSz<T> >(disp), static_cast<PtrStepSz<D> >(xyz));
+        cudaSafeCall( cudaGetLastError() );
+
+        const bool on_default_stream = (stream == 0);
+        if (on_default_stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template void reprojectImageTo3D_gpu<uchar, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+    template void reprojectImageTo3D_gpu<uchar, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+    template void reprojectImageTo3D_gpu<short, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+    template void reprojectImageTo3D_gpu<short, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+
+    /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
+
+    // Maps a disparity value to a packed 32-bit color.
+    //
+    // The hue is H = ((ndisp - d) * 240) / ndisp, converted from HSV (saturation S, value V)
+    // to RGB. The result is packed as (255 << 24) + (r << 16) + (g << 8) + b, each channel
+    // clamped to [0, 1] before scaling by 255.
+    // ndisp must be non-zero (it is used as an integer divisor).
+    template <typename T>
+    __device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
+    {
+        unsigned int H = ((ndisp-d) * 240)/ndisp;
+
+        // Hue sector (0..5) and fractional position inside it.
+        unsigned int hi = (H/60) % 6;
+        float f = H/60.f - H/60;
+
+        float p = V * (1 - S);
+        float q = V * (1 - f * S);
+        float t = V * (1 - (1 - f) * S);
+
+        // res.x holds blue, res.y green, res.z red.
+        float3 res;
+
+        switch (hi)
+        {
+        case 0: // R = V,	G = t,	B = p
+            res.x = p;
+            res.y = t;
+            res.z = V;
+            break;
+
+        case 1: // R = q,	G = V,	B = p
+            res.x = p;
+            res.y = V;
+            res.z = q;
+            break;
+
+        case 2: // R = p,	G = V,	B = t
+            res.x = t;
+            res.y = V;
+            res.z = p;
+            break;
+
+        case 3: // R = p,	G = q,	B = V
+            res.x = V;
+            res.y = q;
+            res.z = p;
+            break;
+
+        case 4: // R = t,	G = p,	B = V
+            res.x = V;
+            res.y = p;
+            res.z = t;
+            break;
+
+        case 5: // R = V,	G = p,	B = q
+            res.x = q;
+            res.y = p;
+            res.z = V;
+            break;
+        }
+
+        const unsigned int b = (unsigned int)(::max(0.f, ::min(res.x, 1.f)) * 255.f);
+        const unsigned int g = (unsigned int)(::max(0.f, ::min(res.y, 1.f)) * 255.f);
+        const unsigned int r = (unsigned int)(::max(0.f, ::min(res.z, 1.f)) * 255.f);
+        const unsigned int a = 255U;
+
+        return (a << 24) + (r << 16) + (g << 8) + b;
+    }
+
+    // Colorizes an 8-bit disparity image; every thread converts four horizontally adjacent pixels.
+    //
+    // disp / disp_step     : source pixels and the per-row offset added to the uchar pointer.
+    // out_image / out_step : destination and its per-row byte offset; one 32-bit word per pixel.
+    // NOTE(review): the guard tests only the first of the four pixels, so the vector load/store
+    // can reach past `width` when it is not a multiple of 4 - presumably covered by row padding; confirm.
+    __global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
+    {
+        const int col = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
+        const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (col >= width || row >= height)
+            return;
+
+        // One vector load fetches four consecutive disparities.
+        const uchar4 src = *(uchar4*)(disp + row * disp_step + col);
+
+        uint4 packed;
+        packed.x = cvtPixel(src.x, ndisp);
+        packed.y = cvtPixel(src.y, ndisp);
+        packed.z = cvtPixel(src.z, ndisp);
+        packed.w = cvtPixel(src.w, ndisp);
+
+        uint4* dstRow = (uint4*)(out_image + row * out_step);
+        dstRow[col >> 2] = packed;
+    }
+
+    // Colorizes a 16-bit disparity image; every thread converts two horizontally adjacent pixels.
+    //
+    // disp / disp_step     : source pixels and the per-row offset in `short` elements (it is added
+    //                        to a short pointer).
+    // out_image / out_step : destination and its per-row byte offset; one 32-bit word per pixel.
+    // NOTE(review): the guard tests only the first of the two pixels, so the vector load/store
+    // can reach past `width` when it is odd - presumably covered by row padding; confirm.
+    __global__ void drawColorDisp(short* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
+    {
+        const int col = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
+        const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (col >= width || row >= height)
+            return;
+
+        // One vector load fetches two consecutive disparities.
+        const short2 src = *(short2*)(disp + row * disp_step + col);
+
+        uint2 packed;
+        packed.x = cvtPixel(src.x, ndisp);
+        packed.y = cvtPixel(src.y, ndisp);
+
+        uint2* dstRow = (uint2*)(out_image + row * out_step);
+        dstRow[col >> 1] = packed;
+    }
+
+
+    // Launches the 8-bit drawColorDisp kernel.
+    // Each thread handles four columns, so the grid width is computed from 4 * threads.x columns per block.
+    // A device-wide synchronization is issued only on the default (0) stream.
+    void drawColorDisp_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
+    {
+        const dim3 threads(16, 16, 1);
+        const dim3 grid(divUp(src.cols, threads.x << 2), divUp(src.rows, threads.y), 1);
+
+        drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
+        cudaSafeCall( cudaGetLastError() );
+
+        const bool on_default_stream = (stream == 0);
+        if (on_default_stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    // Launches the 16-bit drawColorDisp kernel.
+    // Each thread handles two columns, so the grid width is computed from 2 * threads.x columns per block.
+    // The source step is divided by sizeof(short) before being handed to the kernel.
+    // A device-wide synchronization is issued only on the default (0) stream.
+    void drawColorDisp_gpu(const PtrStepSz<short>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
+    {
+        const dim3 threads(32, 8, 1);
+        const dim3 grid(divUp(src.cols, threads.x << 1), divUp(src.rows, threads.y), 1);
+
+        drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
+        cudaSafeCall( cudaGetLastError() );
+
+        const bool on_default_stream = (stream == 0);
+        if (on_default_stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}}} // namespace cv { namespace cuda { namespace device
+
+
+#endif /* CUDA_DISABLER */
--- a/modules/cudastereo/src/disparity_bilateral_filter.cpp
+++ b/modules/cudastereo/src/disparity_bilateral_filter.cpp
@@ -0,0 +1,206 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+
+#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+
+// Stub compiled when CUDA is unavailable or disabled: all arguments are ignored and
+// throw_no_cuda() is called.
+Ptr<cuda::DisparityBilateralFilter> cv::cuda::createDisparityBilateralFilter(int /*ndisp*/, int /*radius*/, int /*iters*/)
+{
+    throw_no_cuda();
+
+    // Presumably never reached (throw_no_cuda is expected to throw); provides a value for the return path.
+    return Ptr<cuda::DisparityBilateralFilter>();
+}
+
+#else /* !defined (HAVE_CUDA) */
+
+// Declarations of the device-side entry points used by this file; their definitions live elsewhere.
+namespace cv { namespace cuda { namespace device
+{
+    namespace disp_bilateral_filter
+    {
+        // Hands the two weight tables and the scalar filter parameters to the device side.
+        void disp_load_constants(float* table_color,
+                                 PtrStepSzf table_space,
+                                 int ndisp,
+                                 int radius,
+                                 short edge_disc,
+                                 short max_disc);
+
+        // Runs `iters` filter iterations over `disp`, using `img` (with `channels` channels) on `stream`.
+        template <typename T>
+        void disp_bilateral_filter(PtrStepSz<T> disp,
+                                   PtrStepSzb img,
+                                   int channels,
+                                   int iters,
+                                   cudaStream_t stream);
+    }
+}}}
+
+namespace
+{
+    // Implementation of cuda::DisparityBilateralFilter.
+    // Holds the filter parameters together with two lookup tables stored in GpuMat objects;
+    // setRadius and setSigmaRange are defined out of line.
+    class DispBilateralFilterImpl : public cuda::DisparityBilateralFilter
+    {
+    public:
+        DispBilateralFilterImpl(int ndisp, int radius, int iters);
+
+        void apply(InputArray disparity, InputArray image, OutputArray dst, Stream& stream);
+
+        // --- number of disparities ---
+        int getNumDisparities() const
+        {
+            return ndisp_;
+        }
+        void setNumDisparities(int numDisparities)
+        {
+            ndisp_ = numDisparities;
+        }
+
+        // --- filter radius ---
+        int getRadius() const
+        {
+            return radius_;
+        }
+        void setRadius(int radius);
+
+        // --- number of iterations ---
+        int getNumIters() const
+        {
+            return iters_;
+        }
+        void setNumIters(int iters)
+        {
+            iters_ = iters;
+        }
+
+        // --- edge threshold (stored as float) ---
+        double getEdgeThreshold() const
+        {
+            return edge_threshold_;
+        }
+        void setEdgeThreshold(double edge_threshold)
+        {
+            edge_threshold_ = static_cast<float>(edge_threshold);
+        }
+
+        // --- maximum discontinuity threshold (stored as float) ---
+        double getMaxDiscThreshold() const
+        {
+            return max_disc_threshold_;
+        }
+        void setMaxDiscThreshold(double max_disc_threshold)
+        {
+            max_disc_threshold_ = static_cast<float>(max_disc_threshold);
+        }
+
+        // --- sigma range (stored as float) ---
+        double getSigmaRange() const
+        {
+            return sigma_range_;
+        }
+        void setSigmaRange(double sigma_range);
+
+    private:
+        int ndisp_;
+        int radius_;
+        int iters_;
+        float edge_threshold_;
+        float max_disc_threshold_;
+        float sigma_range_;
+
+        // Lookup tables uploaded by calc_color_weighted_table / calc_space_weighted_filter.
+        GpuMat table_color_;
+        GpuMat table_space_;
+    };
+
+    // Builds a 1 x len table of float weights exp(-i^2 / (2 * sigma_range^2)), i = 0 .. len-1,
+    // on the host and uploads it into `table_color`.
+    void calc_color_weighted_table(GpuMat& table_color, float sigma_range, int len)
+    {
+        Mat host_table(1, len, CV_32F);
+        float* const weights = host_table.ptr<float>();
+
+        int i = 0;
+        while (i < len)
+        {
+            weights[i] = static_cast<float>(std::exp(-double(i * i) / (2 * sigma_range * sigma_range)));
+            ++i;
+        }
+
+        table_color.upload(host_table);
+    }
+
+    // Builds a (half + 1) x (half + 1) table, half = win_size >> 1, where entry (y, x) is
+    // exp(-sqrt(y^2 + x^2) / dist_space), and uploads it into `table_space`.
+    void calc_space_weighted_filter(GpuMat& table_space, int win_size, float dist_space)
+    {
+        const int half = (win_size >> 1);
+        const int side = half + 1;
+
+        Mat host_table(side, side, CV_32F);
+
+        for (int dy = 0; dy < side; ++dy)
+        {
+            float* const weights = host_table.ptr<float>(dy);
+
+            for (int dx = 0; dx < side; ++dx)
+                weights[dx] = exp(-sqrt(float(dy * dy) + float(dx * dx)) / dist_space);
+        }
+
+        table_space.upload(host_table);
+    }
+
+    // Values assigned to the corresponding parameters by the constructor.
+    const float DEFAULT_EDGE_THRESHOLD = 0.1f;
+    const float DEFAULT_MAX_DISC_THRESHOLD = 0.2f;
+    const float DEFAULT_SIGMA_RANGE = 10.0f;
+
+    // Stores the user parameters, applies the defaults above and builds both lookup tables:
+    // a 255-entry color table and a space table for a (2 * radius + 1) window.
+    DispBilateralFilterImpl::DispBilateralFilterImpl(int ndisp, int radius, int iters)
+        : ndisp_(ndisp)
+        , radius_(radius)
+        , iters_(iters)
+        , edge_threshold_(DEFAULT_EDGE_THRESHOLD)
+        , max_disc_threshold_(DEFAULT_MAX_DISC_THRESHOLD)
+        , sigma_range_(DEFAULT_SIGMA_RANGE)
+    {
+        calc_color_weighted_table(table_color_, sigma_range_, 255);
+
+        const int window = 2 * radius_ + 1;
+        calc_space_weighted_filter(table_space_, window, radius_ + 1.0f);
+    }
+
+    // Updates the radius and rebuilds the space table, which depends on it.
+    void DispBilateralFilterImpl::setRadius(int radius)
+    {
+        radius_ = radius;
+
+        const int window = 2 * radius_ + 1;
+        calc_space_weighted_filter(table_space_, window, radius_ + 1.0f);
+    }
+
+    // Updates the sigma range (narrowed to float) and rebuilds the 255-entry color table.
+    void DispBilateralFilterImpl::setSigmaRange(double sigma_range)
+    {
+        sigma_range_ = static_cast<float>(sigma_range);
+        calc_color_weighted_table(table_color_, sigma_range_, 255);
+    }
+
+    // Runs the filter for disparity element type T.
+    //
+    // Steps: derive the two integer thresholds from ndisp, pass the constants and tables to the
+    // device side, allocate `_dst` with the size/type of `disp`, copy `disp` into it unless both
+    // already share the same buffer, then filter `dst` in place.
+    template <typename T>
+    void disp_bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
+                                        GpuMat& table_color, GpuMat& table_space,
+                                        const GpuMat& disp, const GpuMat& img,
+                                        OutputArray _dst, Stream& stream)
+    {
+        using namespace cv::cuda::device::disp_bilateral_filter;
+
+        // Thresholds are fractions of ndisp, rounded by adding 0.5; edge_disc is at least 1.
+        const short rounded_edge = short(ndisp * edge_threshold + 0.5);
+        const short edge_disc = std::max<short>(short(1), rounded_edge);
+        const short max_disc = short(ndisp * max_disc_threshold + 0.5);
+
+        disp_load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
+
+        _dst.create(disp.size(), disp.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        const bool shares_buffer = (dst.data == disp.data);
+        if (!shares_buffer)
+            disp.copyTo(dst, stream);
+
+        disp_bilateral_filter<T>(dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
+    }
+
+    // Validates the parameters and inputs, then runs the operator matching the disparity type.
+    // Accepted inputs: disparity of type CV_8U or CV_16S, image of type CV_8UC1 or CV_8UC3,
+    // both of the same size.
+    void DispBilateralFilterImpl::apply(InputArray _disp, InputArray _image, OutputArray dst, Stream& stream)
+    {
+        CV_Assert( 0 < ndisp_ && 0 < radius_ && 0 < iters_ );
+
+        GpuMat disp = _disp.getGpuMat();
+        GpuMat img = _image.getGpuMat();
+
+        CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S );
+        CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC3 );
+        CV_Assert( disp.size() == img.size() );
+
+        // Only the two types accepted by the assertion above can reach this point.
+        if (disp.type() == CV_8U)
+        {
+            disp_bilateral_filter_operator<unsigned char>(ndisp_, radius_, iters_, edge_threshold_, max_disc_threshold_,
+                                                          table_color_, table_space_, disp, img, dst, stream);
+        }
+        else
+        {
+            disp_bilateral_filter_operator<short>(ndisp_, radius_, iters_, edge_threshold_, max_disc_threshold_,
+                                                  table_color_, table_space_, disp, img, dst, stream);
+        }
+    }
+}
+
+// Factory: allocates a DispBilateralFilterImpl and returns it through the public interface pointer.
+Ptr<cuda::DisparityBilateralFilter> cv::cuda::createDisparityBilateralFilter(int ndisp, int radius, int iters)
+{
+    Ptr<cuda::DisparityBilateralFilter> filter(new DispBilateralFilterImpl(ndisp, radius, iters));
+    return filter;
+}
+
+#endif /* !defined (HAVE_CUDA) */
--- a/modules/cudastereo/src/precomp.cpp
+++ b/modules/cudastereo/src/precomp.cpp
@@ -0,0 +1,43 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
--- a/modules/cudastereo/src/precomp.hpp
+++ b/modules/cudastereo/src/precomp.hpp
@@ -0,0 +1,53 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_PRECOMP_H__
+#define __OPENCV_PRECOMP_H__
+
+#include <limits>
+
+#include "opencv2/cudastereo.hpp"
+
+#include "opencv2/core/private.cuda.hpp"
+#include "opencv2/core/utility.hpp"
+
+#endif /* __OPENCV_PRECOMP_H__ */
--- a/modules/cudastereo/src/stereobm.cpp
+++ b/modules/cudastereo/src/stereobm.cpp
@@ -0,0 +1,185 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+
+#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+
+// Stub compiled when CUDA is unavailable or disabled: both arguments are ignored and
+// throw_no_cuda() is called.
+Ptr<cuda::StereoBM> cv::cuda::createStereoBM(int /*numDisparities*/, int /*blockSize*/)
+{
+    throw_no_cuda();
+
+    // Presumably never reached (throw_no_cuda is expected to throw); provides a value for the return path.
+    return Ptr<cuda::StereoBM>();
+}
+
+#else /* !defined (HAVE_CUDA) */
+
+// Declarations of the device-side entry points used by this file; their definitions live elsewhere.
+namespace cv { namespace cuda { namespace device
+{
+    namespace stereobm
+    {
+        // Block matching of `left` against `right` into `disp`; `minSSD_buf` is a caller-provided buffer.
+        void stereoBM_GPU(const PtrStepSzb& left,
+                          const PtrStepSzb& right,
+                          const PtrStepSzb& disp,
+                          int ndisp,
+                          int winsz,
+                          const PtrStepSz<unsigned int>& minSSD_buf,
+                          cudaStream_t & stream);
+
+        // X-Sobel pre-filter from `input` into `output`.
+        void prefilter_xsobel(const PtrStepSzb& input,
+                              const PtrStepSzb& output,
+                              int prefilterCap /*= 31*/,
+                              cudaStream_t & stream);
+
+        // Texture-based post-filter applied to `disp`, driven by `input`.
+        void postfilter_textureness(const PtrStepSzb& input,
+                                    int winsz,
+                                    float avgTexturenessThreshold,
+                                    const PtrStepSzb& disp,
+                                    cudaStream_t & stream);
+    }
+}}}
+
+namespace
+{
+    // Implementation of cuda::StereoBM.
+    //
+    // Configurable: number of disparities, block size, pre-filter type, pre-filter cap and
+    // texture threshold. Every other property is accepted but ignored: its setter does nothing
+    // and its getter returns 0 (or an empty Rect for the ROIs).
+    class StereoBMImpl : public cuda::StereoBM
+    {
+    public:
+        StereoBMImpl(int numDisparities, int blockSize);
+
+        void compute(InputArray left, InputArray right, OutputArray disparity);
+        void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream);
+
+        // ---- properties backed by a member ----
+
+        int getNumDisparities() const
+        {
+            return ndisp_;
+        }
+        void setNumDisparities(int numDisparities)
+        {
+            ndisp_ = numDisparities;
+        }
+
+        int getBlockSize() const
+        {
+            return winSize_;
+        }
+        void setBlockSize(int blockSize)
+        {
+            winSize_ = blockSize;
+        }
+
+        int getPreFilterType() const
+        {
+            return preset_;
+        }
+        void setPreFilterType(int preFilterType)
+        {
+            preset_ = preFilterType;
+        }
+
+        int getPreFilterCap() const
+        {
+            return preFilterCap_;
+        }
+        void setPreFilterCap(int preFilterCap)
+        {
+            preFilterCap_ = preFilterCap;
+        }
+
+        // Stored as float, exposed as int.
+        int getTextureThreshold() const
+        {
+            return static_cast<int>(avergeTexThreshold_);
+        }
+        void setTextureThreshold(int textureThreshold)
+        {
+            avergeTexThreshold_ = static_cast<float>(textureThreshold);
+        }
+
+        // ---- properties that are ignored ----
+
+        int getMinDisparity() const { return 0; }
+        void setMinDisparity(int /*minDisparity*/) {}
+
+        int getSpeckleWindowSize() const { return 0; }
+        void setSpeckleWindowSize(int /*speckleWindowSize*/) {}
+
+        int getSpeckleRange() const { return 0; }
+        void setSpeckleRange(int /*speckleRange*/) {}
+
+        int getDisp12MaxDiff() const { return 0; }
+        void setDisp12MaxDiff(int /*disp12MaxDiff*/) {}
+
+        int getPreFilterSize() const { return 0; }
+        void setPreFilterSize(int /*preFilterSize*/) {}
+
+        int getUniquenessRatio() const { return 0; }
+        void setUniquenessRatio(int /*uniquenessRatio*/) {}
+
+        int getSmallerBlockSize() const { return 0; }
+        void setSmallerBlockSize(int /*blockSize*/) {}
+
+        Rect getROI1() const { return Rect(); }
+        void setROI1(Rect /*roi1*/) {}
+
+        Rect getROI2() const { return Rect(); }
+        void setROI2(Rect /*roi2*/) {}
+
+    private:
+        int preset_;
+        int ndisp_;
+        int winSize_;
+        int preFilterCap_;
+        float avergeTexThreshold_;
+
+        // Buffers kept between calls and resized on demand by compute().
+        GpuMat minSSD_, leBuf_, riBuf_;
+    };
+
+    // Stores the two user parameters; pre-filter type starts at 0, pre-filter cap at 31 and
+    // the texture threshold at 3.
+    StereoBMImpl::StereoBMImpl(int numDisparities, int blockSize)
+        : preset_(0)
+        , ndisp_(numDisparities)
+        , winSize_(blockSize)
+        , preFilterCap_(31)
+        , avergeTexThreshold_(3)
+    {
+    }
+
+    // Convenience overload: forwards to the stream-aware compute() with Stream::Null().
+    void StereoBMImpl::compute(InputArray left, InputArray right, OutputArray disparity)
+    {
+        Stream& null_stream = Stream::Null();
+        compute(left, right, disparity, null_stream);
+    }
+
+    // Stream-aware block matching.
+    //
+    // Requirements checked here: 0 < ndisp_ <= 256, ndisp_ divisible by 8, odd winSize_,
+    // both inputs CV_8UC1 and of equal size.
+    // Pipeline: optional X-Sobel pre-filter of both images, stereoBM_GPU into a CV_8UC1
+    // disparity map, then postfilter_textureness when the texture threshold is positive.
+    void StereoBMImpl::compute(InputArray _left, InputArray _right, OutputArray _disparity, Stream& _stream)
+    {
+        using namespace ::cv::cuda::device::stereobm;
+
+        const int max_supported_ndisp = 1 << (sizeof(unsigned char) * 8);
+        CV_Assert( 0 < ndisp_ && ndisp_ <= max_supported_ndisp );
+        CV_Assert( ndisp_ % 8 == 0 );
+        CV_Assert( winSize_ % 2 == 1 );
+
+        GpuMat left = _left.getGpuMat();
+        GpuMat right = _right.getGpuMat();
+
+        CV_Assert( left.type() == CV_8UC1 );
+        CV_Assert( left.size() == right.size() && left.type() == right.type() );
+
+        _disparity.create(left.size(), CV_8UC1);
+        GpuMat disparity = _disparity.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        cuda::ensureSizeIsEnough(left.size(), CV_32SC1, minSSD_);
+
+        const bool use_xsobel = (preset_ == cv::StereoBM::PREFILTER_XSOBEL);
+
+        if (use_xsobel)
+        {
+            cuda::ensureSizeIsEnough(left.size(), left.type(), leBuf_);
+            cuda::ensureSizeIsEnough(right.size(), right.type(), riBuf_);
+
+            prefilter_xsobel( left, leBuf_, preFilterCap_, stream);
+            prefilter_xsobel(right, riBuf_, preFilterCap_, stream);
+        }
+
+        // Matching runs on the pre-filtered buffers when they were produced, otherwise on the raw inputs.
+        PtrStepSzb le_for_bm = use_xsobel ? leBuf_ : left;
+        PtrStepSzb ri_for_bm = use_xsobel ? riBuf_ : right;
+
+        stereoBM_GPU(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, minSSD_, stream);
+
+        if (avergeTexThreshold_ > 0)
+            postfilter_textureness(le_for_bm, winSize_, avergeTexThreshold_, disparity, stream);
+    }
+}
+
+// Factory: allocates a StereoBMImpl and returns it through the public interface pointer.
+Ptr<cuda::StereoBM> cv::cuda::createStereoBM(int numDisparities, int blockSize)
+{
+    Ptr<cuda::StereoBM> matcher(new StereoBMImpl(numDisparities, blockSize));
+    return matcher;
+}
+
+#endif /* !defined (HAVE_CUDA) */
--- a/modules/cudastereo/src/stereobp.cpp
+++ b/modules/cudastereo/src/stereobp.cpp
@@ -0,0 +1,380 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+
+#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+
+// Stub compiled when CUDA is unavailable or disabled: ignores its arguments and calls throw_no_cuda().
+void cv::cuda::StereoBeliefPropagation::estimateRecommendedParams(int, int, int&, int&, int&)
+{
+    throw_no_cuda();
+}
+
+// Stub compiled when CUDA is unavailable or disabled: calls throw_no_cuda().
+// The empty Ptr is returned only if that call comes back (presumably it never does).
+Ptr<cuda::StereoBeliefPropagation> cv::cuda::createStereoBeliefPropagation(int, int, int, int)
+{
+    throw_no_cuda();
+    return Ptr<cuda::StereoBeliefPropagation>();
+}
+
+#else /* !defined (HAVE_CUDA) */
+
+// Forward declarations of the stereobp entry points used by StereoBPImpl below.
+// Their definitions are not part of this file (presumably they live in the
+// module's CUDA sources - TODO confirm). The host code in this file uses the
+// templates with T = short or float for the message type.
+namespace cv { namespace cuda { namespace device
+{
+    namespace stereobp
+    {
+        // Receives the algorithm parameters; called once per run from StereoBPImpl::init().
+        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump);
+        // Fills `data` from the image pair. T is the pixel type (unsigned char, uchar3 or uchar4
+        // in this file), D the message type.
+        template<typename T, typename D>
+        void comp_data_gpu(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
+        // Produces the data term of the next pyramid level (`dst`) from the previous one (`src`).
+        template<typename T>
+        void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
+        // Works on the two-element message arrays mus/mds/mls/mrs; dst_idx selects the destination set.
+        template <typename T>
+        void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
+        // Runs `iters` iterations on one pyramid level using the given messages and data term.
+        template <typename T>
+        void calc_all_iterations_gpu(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d,
+            const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
+        // Writes the 16-bit disparity map `disp` from the messages and data term.
+        template <typename T>
+        void output_gpu(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data,
+            const PtrStepSz<short>& disp, cudaStream_t stream);
+    }
+}}}
+
+namespace
+{
+    // Implementation of the cuda::StereoBeliefPropagation interface on top of
+    // the cv::cuda::device::stereobp functions.
+    class StereoBPImpl : public cuda::StereoBeliefPropagation
+    {
+    public:
+        // Stores the given values; the four cost-model parameters start at their defaults.
+        StereoBPImpl(int ndisp, int iters, int levels, int msg_type);
+
+        // Disparity from an image pair, using Stream::Null().
+        void compute(InputArray left, InputArray right, OutputArray disparity);
+        // Disparity from an image pair on the given stream.
+        void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream);
+        // Disparity from a user-supplied data cost of type msg_type_ with rows * ndisp_ rows.
+        void compute(InputArray data, OutputArray disparity, Stream& stream);
+
+        // The following StereoMatcher-style parameters are not used by this
+        // implementation: getters return 0 and setters discard their argument.
+        int getMinDisparity() const { return 0; }
+        void setMinDisparity(int /*minDisparity*/) {}
+
+        int getNumDisparities() const { return ndisp_; }
+        void setNumDisparities(int numDisparities) { ndisp_ = numDisparities; }
+
+        int getBlockSize() const { return 0; }
+        void setBlockSize(int /*blockSize*/) {}
+
+        int getSpeckleWindowSize() const { return 0; }
+        void setSpeckleWindowSize(int /*speckleWindowSize*/) {}
+
+        int getSpeckleRange() const { return 0; }
+        void setSpeckleRange(int /*speckleRange*/) {}
+
+        int getDisp12MaxDiff() const { return 0; }
+        void setDisp12MaxDiff(int /*disp12MaxDiff*/) {}
+
+        // Setters below store the value without validation; compute() checks it.
+        int getNumIters() const { return iters_; }
+        void setNumIters(int iters) { iters_ = iters; }
+
+        int getNumLevels() const { return levels_; }
+        void setNumLevels(int levels) { levels_ = levels; }
+
+        // The cost-model parameters are exposed as double but stored as float.
+        double getMaxDataTerm() const { return max_data_term_; }
+        void setMaxDataTerm(double max_data_term) { max_data_term_ = (float) max_data_term; }
+
+        double getDataWeight() const { return data_weight_; }
+        void setDataWeight(double data_weight) { data_weight_ = (float) data_weight; }
+
+        double getMaxDiscTerm() const { return max_disc_term_; }
+        void setMaxDiscTerm(double max_disc_term) { max_disc_term_ = (float) max_disc_term; }
+
+        double getDiscSingleJump() const { return disc_single_jump_; }
+        void setDiscSingleJump(double disc_single_jump) { disc_single_jump_ = (float) disc_single_jump; }
+
+        // compute() accepts only CV_32F or CV_16S here.
+        int getMsgType() const { return msg_type_; }
+        void setMsgType(int msg_type) { msg_type_ = msg_type; }
+
+    private:
+        // Allocates/clears the message buffers and uploads the constants.
+        void init(Stream& stream);
+        // Runs the coarse-to-fine propagation and writes the disparity map.
+        void calcBP(OutputArray disp, Stream& stream);
+
+        int ndisp_;
+        int iters_;
+        int levels_;
+        float max_data_term_;
+        float data_weight_;
+        float max_disc_term_;
+        float disc_single_jump_;
+        int msg_type_;
+
+        // Set at the start of each compute(): 1 for CV_32F messages, 10 otherwise.
+        float scale_;
+        // Image size of the current run, set by compute().
+        int rows_, cols_;
+        // Per-level sizes; element 0 is the full resolution.
+        std::vector<int> cols_all_, rows_all_;
+        // Message buffers: u_/d_/l_/r_ are full size, the *2_ set is half size (rounded up).
+        GpuMat u_, d_, l_, r_, u2_, d2_, l2_, r2_;
+        // Data term per level; element 0 is the full resolution.
+        std::vector<GpuMat> datas_;
+        // 16-bit staging buffer used when the caller asks for another output type.
+        GpuMat outBuf_;
+    };
+
+    // Initial values of the cost-model parameters installed by the constructor.
+    const float DEFAULT_MAX_DATA_TERM = 10.0f;
+    const float DEFAULT_DATA_WEIGHT = 0.07f;
+    const float DEFAULT_MAX_DISC_TERM = 1.7f;
+    const float DEFAULT_DISC_SINGLE_JUMP = 1.0f;
+
+    // Stores the caller's sizes and message type and applies the defaults above.
+    // scale_, rows_ and cols_ are not initialised here; compute() assigns them.
+    StereoBPImpl::StereoBPImpl(int ndisp, int iters, int levels, int msg_type)
+        : ndisp_(ndisp)
+        , iters_(iters)
+        , levels_(levels)
+        , max_data_term_(DEFAULT_MAX_DATA_TERM)
+        , data_weight_(DEFAULT_DATA_WEIGHT)
+        , max_disc_term_(DEFAULT_MAX_DISC_TERM)
+        , disc_single_jump_(DEFAULT_DISC_SINGLE_JUMP)
+        , msg_type_(msg_type)
+    {
+    }
+
+    // Convenience overload: forwards to the stream-aware compute() using Stream::Null().
+    void StereoBPImpl::compute(InputArray left, InputArray right, OutputArray disparity)
+    {
+        Stream& nullStream = Stream::Null();
+        compute(left, right, disparity, nullStream);
+    }
+
+    // Computes disparity from a left/right image pair.
+    //
+    // _left, _right - GpuMat inputs of type CV_8UC1, CV_8UC3 or CV_8UC4 with
+    //                 identical size and type.
+    // disparity     - output, filled by calcBP().
+    // stream        - stream used for the device work issued here.
+    //
+    // CV_Assert fires on non-positive ndisp_/iters_/levels_, on a message type
+    // other than CV_32F/CV_16S, when the CV_16S range check fails, on
+    // mismatching inputs, or when the image divided by 2^(levels_-1) is not
+    // larger than 2 in both dimensions.
+    void StereoBPImpl::compute(InputArray _left, InputArray _right, OutputArray disparity, Stream& stream)
+    {
+        using namespace cv::cuda::device::stereobp;
+
+        // Data-cost functions indexed by [message type is CV_32F][channel count].
+        typedef void (*comp_data_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
+        static const comp_data_t comp_data_callers[2][5] =
+        {
+            {0, comp_data_gpu<unsigned char, short>, 0, comp_data_gpu<uchar3, short>, comp_data_gpu<uchar4, short>},
+            {0, comp_data_gpu<unsigned char, float>, 0, comp_data_gpu<uchar3, float>, comp_data_gpu<uchar4, float>}
+        };
+
+        // Non-float messages use a scale of 10.
+        if (msg_type_ == CV_32F)
+            scale_ = 1.0f;
+        else
+            scale_ = 10.0f;
+
+        CV_Assert( 0 < ndisp_ && 0 < iters_ && 0 < levels_ );
+        CV_Assert( msg_type_ == CV_32F || msg_type_ == CV_16S );
+        CV_Assert( msg_type_ == CV_32F || (1 << (levels_ - 1)) * scale_ * max_data_term_ < std::numeric_limits<short>::max() );
+
+        GpuMat left = _left.getGpuMat();
+        GpuMat right = _right.getGpuMat();
+
+        CV_Assert( left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4 );
+        CV_Assert( left.size() == right.size() && left.type() == right.type() );
+
+        rows_ = left.rows;
+        cols_ = left.cols;
+
+        // Reject images that become too small after levels_ - 1 halvings.
+        const int min_image_dim_size = 2;
+        const int divisor = (int) pow(2.f, levels_ - 1.0f);
+        const int lowest_cols = cols_ / divisor;
+        const int lowest_rows = rows_ / divisor;
+        CV_Assert( std::min(lowest_cols, lowest_rows) > min_image_dim_size );
+
+        init(stream);
+
+        // Full-resolution data term: one block of rows_ rows per disparity.
+        datas_[0].create(rows_ * ndisp_, cols_, msg_type_);
+
+        const comp_data_t computeData = comp_data_callers[msg_type_ == CV_32F][left.channels()];
+        computeData(left, right, datas_[0], StreamAccessor::getStream(stream));
+
+        calcBP(disparity, stream);
+    }
+
+    // Computes disparity from a user-supplied data cost.
+    //
+    // _data     - GpuMat of type msg_type_ whose row count is a multiple of
+    //             ndisp_; the image height is taken as data.rows / ndisp_.
+    // disparity - output, filled by calcBP().
+    // stream    - stream used for the device work issued here.
+    //
+    // CV_Assert fires under the same parameter and size rules as the
+    // image-pair overload, and when _data does not match the layout above.
+    void StereoBPImpl::compute(InputArray _data, OutputArray disparity, Stream& stream)
+    {
+        // Non-float messages use a scale of 10.
+        if (msg_type_ == CV_32F)
+            scale_ = 1.0f;
+        else
+            scale_ = 10.0f;
+
+        CV_Assert( 0 < ndisp_ && 0 < iters_ && 0 < levels_ );
+        CV_Assert( msg_type_ == CV_32F || msg_type_ == CV_16S );
+        CV_Assert( msg_type_ == CV_32F || (1 << (levels_ - 1)) * scale_ * max_data_term_ < std::numeric_limits<short>::max() );
+
+        GpuMat data = _data.getGpuMat();
+
+        CV_Assert( (data.type() == msg_type_) && (data.rows % ndisp_ == 0) );
+
+        cols_ = data.cols;
+        rows_ = data.rows / ndisp_;
+
+        // Reject images that become too small after levels_ - 1 halvings.
+        const int min_image_dim_size = 2;
+        const int divisor = (int) pow(2.f, levels_ - 1.0f);
+        const int lowest_cols = cols_ / divisor;
+        const int lowest_rows = rows_ / divisor;
+        CV_Assert( std::min(lowest_cols, lowest_rows) > min_image_dim_size );
+
+        init(stream);
+
+        // The supplied cost becomes the full-resolution level of the pyramid.
+        GpuMat& level0 = datas_[0];
+        data.copyTo(level0, stream);
+
+        calcBP(disparity, stream);
+    }
+
+    // Prepares a run: sizes the message buffers, zeroes the set used by the
+    // coarsest level, uploads the constants and sizes the per-level vectors.
+    // Expects rows_, cols_ and scale_ to have been set by compute().
+    void StereoBPImpl::init(Stream& stream)
+    {
+        using namespace cv::cuda::device::stereobp;
+
+        // An odd level count starts on the full-size set, an even one on the half-size set.
+        const bool oddLevels = (levels_ & 1) != 0;
+
+        GpuMat* const fullSize[] = { &u_, &d_, &l_, &r_ };
+        for (int k = 0; k < 4; ++k)
+            fullSize[k]->create(rows_ * ndisp_, cols_, msg_type_);
+
+        if (oddLevels)
+        {
+            // The whole buffer is cleared although a smaller area would do.
+            for (int k = 0; k < 4; ++k)
+                fullSize[k]->setTo(0, stream);
+        }
+
+        if (levels_ > 1)
+        {
+            const int halfRows = (rows_ + 1) / 2;
+            const int halfCols = (cols_ + 1) / 2;
+
+            GpuMat* const halfSize[] = { &u2_, &d2_, &l2_, &r2_ };
+            for (int k = 0; k < 4; ++k)
+                halfSize[k]->create(halfRows * ndisp_, halfCols, msg_type_);
+
+            if (!oddLevels)
+            {
+                for (int k = 0; k < 4; ++k)
+                    halfSize[k]->setTo(0, stream);
+            }
+        }
+
+        // max_data_term_ is passed as is; the other three terms are multiplied by scale_.
+        load_constants(ndisp_, max_data_term_, scale_ * data_weight_, scale_ * max_disc_term_, scale_ * disc_single_jump_);
+
+        datas_.resize(levels_);
+
+        rows_all_.resize(levels_);
+        cols_all_.resize(levels_);
+
+        rows_all_[0] = rows_;
+        cols_all_[0] = cols_;
+    }
+
+    // Runs the coarse-to-fine propagation and writes the disparity map.
+    //
+    // Expects datas_[0] to hold the full-resolution data term and init() to
+    // have been called for the current rows_/cols_.
+    //
+    // disp    - output; created as rows_ x cols_. Its type is the caller's
+    //           fixed type if there is one, CV_16SC1 otherwise.
+    // _stream - stream used for the device work issued here.
+    void StereoBPImpl::calcBP(OutputArray disp, Stream& _stream)
+    {
+        using namespace cv::cuda::device::stereobp;
+
+        // Each table below is indexed by funcIdx: 0 = short messages, 1 = float messages.
+        typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
+        static const data_step_down_t data_step_down_callers[2] =
+        {
+            data_step_down_gpu<short>, data_step_down_gpu<float>
+        };
+
+        typedef void (*level_up_messages_t)(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
+        static const level_up_messages_t level_up_messages_callers[2] =
+        {
+            level_up_messages_gpu<short>, level_up_messages_gpu<float>
+        };
+
+        typedef void (*calc_all_iterations_t)(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
+        static const calc_all_iterations_t calc_all_iterations_callers[2] =
+        {
+            calc_all_iterations_gpu<short>, calc_all_iterations_gpu<float>
+        };
+
+        typedef void (*output_t)(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
+        static const output_t output_callers[2] =
+        {
+            output_gpu<short>, output_gpu<float>
+        };
+
+        const int funcIdx = msg_type_ == CV_32F;
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        // Build the data-term pyramid: every level has half the size of the
+        // previous one, rounded up.
+        for (int i = 1; i < levels_; ++i)
+        {
+            cols_all_[i] = (cols_all_[i-1] + 1) / 2;
+            rows_all_[i] = (rows_all_[i-1] + 1) / 2;
+
+            datas_[i].create(rows_all_[i] * ndisp_, cols_all_[i], msg_type_);
+
+            data_step_down_callers[funcIdx](cols_all_[i], rows_all_[i], rows_all_[i-1], datas_[i-1], datas_[i], stream);
+        }
+
+        // Two message sets: index 0 is the full-size set, index 1 the half-size set.
+        PtrStepSzb mus[] = {u_, u2_};
+        PtrStepSzb mds[] = {d_, d2_};
+        PtrStepSzb mrs[] = {r_, r2_};
+        PtrStepSzb mls[] = {l_, l2_};
+
+        // The set toggles once per level, so this start value makes level 0
+        // land on index 0, which is the set read by output_callers below.
+        int mem_idx = (levels_ & 1) ? 0 : 1;
+
+        // Coarsest level first.
+        for (int i = levels_ - 1; i >= 0; --i)
+        {
+            // The coarsest level starts from the messages that init() set to zero;
+            // every other level is seeded via level_up_messages.
+            if (i != levels_ - 1)
+                level_up_messages_callers[funcIdx](mem_idx, cols_all_[i], rows_all_[i], rows_all_[i+1], mus, mds, mls, mrs, stream);
+
+            calc_all_iterations_callers[funcIdx](cols_all_[i], rows_all_[i], iters_, mus[mem_idx], mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas_[i], stream);
+
+            mem_idx = (mem_idx + 1) & 1;
+        }
+
+        const int dtype = disp.fixedType() ? disp.type() : CV_16SC1;
+
+        disp.create(rows_, cols_, dtype);
+        GpuMat out = disp.getGpuMat();
+
+        // output_callers writes 16-bit values, so any other requested type goes
+        // through the outBuf_ staging buffer and is converted afterwards.
+        if (dtype != CV_16SC1)
+        {
+            outBuf_.create(rows_, cols_, CV_16SC1);
+            out = outBuf_;
+        }
+
+        out.setTo(0, _stream);
+
+        output_callers[funcIdx](u_, d_, l_, r_, datas_.front(), out, stream);
+
+        if (dtype != CV_16SC1)
+            out.convertTo(disp, dtype, _stream);
+    }
+}
+
+// Factory: creates a StereoBPImpl and returns it through the cuda::StereoBeliefPropagation interface.
+Ptr<cuda::StereoBeliefPropagation> cv::cuda::createStereoBeliefPropagation(int ndisp, int iters, int levels, int msg_type)
+{
+    Ptr<cuda::StereoBeliefPropagation> impl = new StereoBPImpl(ndisp, iters, levels, msg_type);
+    return impl;
+}
+
+// Suggests ndisp, iters and levels for an image of the given size.
+//
+// width, height - image size; both must be positive.
+// ndisp  - receives width / 4, increased by one when that value is odd.
+// iters  - receives max(width, height) / 100 + 2.
+// levels - receives (int)(ln(max(width, height)) + 1) * 4 / 5, but at least 1.
+//
+// CV_Assert fires when width or height is not positive: the logarithm of
+// such a value is -inf or NaN, and converting that to int is undefined.
+void cv::cuda::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)
+{
+    CV_Assert( width > 0 && height > 0 );
+
+    ndisp = width / 4;
+    if ((ndisp & 1) != 0)
+        ndisp++;
+
+    const int mm = std::max(width, height);
+    iters = mm / 100 + 2;
+
+    // The cast applies to (ln(mm) + 1) only; the * 4 / 5 is integer arithmetic.
+    levels = (int)(::log(static_cast<double>(mm)) + 1) * 4 / 5;
+    if (levels == 0) levels++;
+}
+
+#endif /* !defined (HAVE_CUDA) */
--- a/modules/cudastereo/src/stereocsbp.cpp
+++ b/modules/cudastereo/src/stereocsbp.cpp
@@ -0,0 +1,387 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+
+#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+
+// Stub compiled when CUDA is unavailable or disabled: ignores its arguments and calls throw_no_cuda().
+void cv::cuda::StereoConstantSpaceBP::estimateRecommendedParams(int, int, int&, int&, int&, int&)
+{
+    throw_no_cuda();
+}
+
+// Stub compiled when CUDA is unavailable or disabled: calls throw_no_cuda().
+// The empty Ptr is returned only if that call comes back (presumably it never does).
+Ptr<cuda::StereoConstantSpaceBP> cv::cuda::createStereoConstantSpaceBP(int, int, int, int, int)
+{
+    throw_no_cuda();
+    return Ptr<cuda::StereoConstantSpaceBP>();
+}
+
+#else /* !defined (HAVE_CUDA) */
+
+// Forward declarations of the stereocsbp entry points used by StereoCSBPImpl
+// below. Their definitions are not part of this file (presumably they live in
+// the module's CUDA sources - TODO confirm). The host code in this file calls
+// the templates with T = float or short, matching the message type.
+namespace cv { namespace cuda { namespace device
+{
+    namespace stereocsbp
+    {
+        // Receives the algorithm parameters together with the input images and a temporary buffer.
+        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
+            const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& temp);
+
+        // Called for the coarsest pyramid level only.
+        template<class T>
+        void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
+                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
+
+        // Called for every level except the coarsest, before init_message.
+        template<class T>
+        void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step,
+                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
+
+        // Takes the current (*_cur) buffers as input and the new (*_new) buffers as output;
+        // h/w/nr_plane describe the new level, h2/w2/nr_plane2 the previous one.
+        template<class T>
+        void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
+                          const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
+                          T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
+                          T* data_cost_selected, const T* data_cost, size_t msg_step,
+                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
+
+        // Runs `iters` iterations on one pyramid level.
+        template<class T>
+        void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
+            const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream);
+
+        // Writes the 16-bit disparity map `disp`.
+        template<class T>
+        void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
+            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
+    }
+}}}
+
+namespace
+{
+    // Implementation of the cuda::StereoConstantSpaceBP interface on top of
+    // the cv::cuda::device::stereocsbp functions.
+    class StereoCSBPImpl : public cuda::StereoConstantSpaceBP
+    {
+    public:
+        // Stores the given values; the remaining parameters start at their defaults.
+        StereoCSBPImpl(int ndisp, int iters, int levels, int nr_plane, int msg_type);
+
+        // Disparity from an image pair, using Stream::Null().
+        void compute(InputArray left, InputArray right, OutputArray disparity);
+        // Disparity from an image pair on the given stream.
+        void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream);
+        // Not implemented for this algorithm; raises Error::StsNotImplemented.
+        void compute(InputArray data, OutputArray disparity, Stream& stream);
+
+        // The minimum disparity is stored as min_disp_th_ and handed to load_constants().
+        int getMinDisparity() const { return min_disp_th_; }
+        void setMinDisparity(int minDisparity) { min_disp_th_ = minDisparity; }
+
+        int getNumDisparities() const { return ndisp_; }
+        void setNumDisparities(int numDisparities) { ndisp_ = numDisparities; }
+
+        // The following StereoMatcher-style parameters are not used by this
+        // implementation: getters return 0 and setters discard their argument.
+        int getBlockSize() const { return 0; }
+        void setBlockSize(int /*blockSize*/) {}
+
+        int getSpeckleWindowSize() const { return 0; }
+        void setSpeckleWindowSize(int /*speckleWindowSize*/) {}
+
+        int getSpeckleRange() const { return 0; }
+        void setSpeckleRange(int /*speckleRange*/) {}
+
+        int getDisp12MaxDiff() const { return 0; }
+        void setDisp12MaxDiff(int /*disp12MaxDiff*/) {}
+
+        // Setters below store the value without validation; compute() checks it.
+        int getNumIters() const { return iters_; }
+        void setNumIters(int iters) { iters_ = iters; }
+
+        // Note: compute() may lower the stored level count (it clamps it by ndisp_).
+        int getNumLevels() const { return levels_; }
+        void setNumLevels(int levels) { levels_ = levels; }
+
+        // The cost-model parameters are exposed as double but stored as float.
+        double getMaxDataTerm() const { return max_data_term_; }
+        void setMaxDataTerm(double max_data_term) { max_data_term_ = (float) max_data_term; }
+
+        double getDataWeight() const { return data_weight_; }
+        void setDataWeight(double data_weight) { data_weight_ = (float) data_weight; }
+
+        double getMaxDiscTerm() const { return max_disc_term_; }
+        void setMaxDiscTerm(double max_disc_term) { max_disc_term_ = (float) max_disc_term; }
+
+        double getDiscSingleJump() const { return disc_single_jump_; }
+        void setDiscSingleJump(double disc_single_jump) { disc_single_jump_ = (float) disc_single_jump; }
+
+        // compute() accepts only CV_32F or CV_16S here.
+        int getMsgType() const { return msg_type_; }
+        void setMsgType(int msg_type) { msg_type_ = msg_type; }
+
+        // Number of planes at the finest level; compute() doubles it per coarser level.
+        int getNrPlane() const { return nr_plane_; }
+        void setNrPlane(int nr_plane) { nr_plane_ = nr_plane; }
+
+        // Flag forwarded to init_data_cost().
+        bool getUseLocalInitDataCost() const { return use_local_init_data_cost_; }
+        void setUseLocalInitDataCost(bool use_local_init_data_cost) { use_local_init_data_cost_ = use_local_init_data_cost; }
+
+    private:
+        int min_disp_th_;
+        int ndisp_;
+        int iters_;
+        int levels_;
+        float max_data_term_;
+        float data_weight_;
+        float max_disc_term_;
+        float disc_single_jump_;
+        int msg_type_;
+        int nr_plane_;
+        bool use_local_init_data_cost_;
+
+        // Single allocation that compute() slices into all message/cost buffers.
+        GpuMat mbuf_;
+        // Temporary buffer handed to load_constants().
+        GpuMat temp_;
+        // 16-bit staging buffer used when the caller asks for another output type.
+        GpuMat outBuf_;
+    };
+
+    // Initial values of the cost-model parameters installed by the constructor.
+    const float DEFAULT_MAX_DATA_TERM = 30.0f;
+    const float DEFAULT_DATA_WEIGHT = 1.0f;
+    const float DEFAULT_MAX_DISC_TERM = 160.0f;
+    const float DEFAULT_DISC_SINGLE_JUMP = 10.0f;
+
+    // Stores the caller's values and applies the defaults above; the minimum
+    // disparity starts at 0 and the local data-cost initialisation is enabled.
+    StereoCSBPImpl::StereoCSBPImpl(int ndisp, int iters, int levels, int nr_plane, int msg_type)
+        : min_disp_th_(0)
+        , ndisp_(ndisp)
+        , iters_(iters)
+        , levels_(levels)
+        , max_data_term_(DEFAULT_MAX_DATA_TERM)
+        , data_weight_(DEFAULT_DATA_WEIGHT)
+        , max_disc_term_(DEFAULT_MAX_DISC_TERM)
+        , disc_single_jump_(DEFAULT_DISC_SINGLE_JUMP)
+        , msg_type_(msg_type)
+        , nr_plane_(nr_plane)
+        , use_local_init_data_cost_(true)
+    {
+    }
+
+    // Convenience overload: forwards to the stream-aware compute() using Stream::Null().
+    void StereoCSBPImpl::compute(InputArray left, InputArray right, OutputArray disparity)
+    {
+        Stream& nullStream = Stream::Null();
+        compute(left, right, disparity, nullStream);
+    }
+
+    // Computes disparity from a left/right image pair.
+    //
+    // _left, _right - GpuMat inputs of type CV_8UC1, CV_8UC3 or CV_8UC4 with
+    //                 identical size and type.
+    // disp          - output; created as rows x cols. Its type is the caller's
+    //                 fixed type if there is one, CV_16SC1 otherwise.
+    // _stream       - stream used for the device work issued here.
+    //
+    // CV_Assert fires on a message type other than CV_32F/CV_16S, on
+    // non-positive ndisp_/iters_/levels_/nr_plane_, on levels_ > 8, on
+    // ndisp_ < 2, or on mismatching inputs.
+    //
+    // Side effect: levels_ is clamped in place to floor(log2(ndisp_)), so
+    // getNumLevels() may report a smaller value after this call.
+    void StereoCSBPImpl::compute(InputArray _left, InputArray _right, OutputArray disp, Stream& _stream)
+    {
+        using namespace cv::cuda::device::stereocsbp;
+
+        CV_Assert( msg_type_ == CV_32F || msg_type_ == CV_16S );
+        CV_Assert( 0 < ndisp_ && 0 < iters_ && 0 < levels_ && 0 < nr_plane_ && levels_ <= 8 );
+
+        GpuMat left = _left.getGpuMat();
+        GpuMat right = _right.getGpuMat();
+
+        CV_Assert( left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4 );
+        CV_Assert( left.size() == right.size() && left.type() == right.type() );
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        ////////////////////////////////////////////////////////////////////////////////////////////
+        // Init
+
+        int rows = left.rows;
+        int cols = left.cols;
+
+        // floor(log2(ndisp_)), computed with integers so that the result does
+        // not depend on floating-point rounding of a log() ratio.
+        int max_levels = 0;
+        for (int n = ndisp_; n > 1; n >>= 1)
+            ++max_levels;
+
+        // ndisp_ == 1 would leave zero levels; the code below indexes the
+        // pyramid arrays with levels_ - 1, so reject it before touching levels_.
+        CV_Assert( max_levels > 0 );
+
+        levels_ = std::min(levels_, max_levels);
+
+        // compute sizes
+        // One allocation holds three arrays of levels_ ints each.
+        AutoBuffer<int> buf(levels_ * 3);
+        int* cols_pyr = buf;
+        int* rows_pyr = cols_pyr + levels_;
+        int* nr_plane_pyr = rows_pyr + levels_;
+
+        cols_pyr[0]     = cols;
+        rows_pyr[0]     = rows;
+        nr_plane_pyr[0] = nr_plane_;
+
+        // Each coarser level halves the image size and doubles the plane count.
+        for (int i = 1; i < levels_; i++)
+        {
+            cols_pyr[i]     = cols_pyr[i-1] / 2;
+            rows_pyr[i]     = rows_pyr[i-1] / 2;
+            nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2;
+        }
+
+        GpuMat u[2], d[2], l[2], r[2], disp_selected_pyr[2], data_cost, data_cost_selected;
+
+        //allocate buffers
+        // mbuf_ is cut into 13 slabs of rows * nr_plane_ rows each.
+        int buffers_count = 10; // (up + down + left + right + disp_selected_pyr) * 2
+        buffers_count += 2; //  data_cost has twice as many rows as the other buffers, that's why +2, not +1;
+        buffers_count += 1; //  data_cost_selected
+        mbuf_.create(rows * nr_plane_ * buffers_count, cols, msg_type_);
+
+        data_cost          = mbuf_.rowRange(0, rows * nr_plane_ * 2);
+        data_cost_selected = mbuf_.rowRange(data_cost.rows, data_cost.rows + rows * nr_plane_);
+
+        // The remaining 10 slabs form two sets (index 0 and 1) of five buffers each.
+        for(int k = 0; k < 2; ++k) // in/out
+        {
+            GpuMat sub1 = mbuf_.rowRange(data_cost.rows + data_cost_selected.rows, mbuf_.rows);
+            GpuMat sub2 = sub1.rowRange((k+0)*sub1.rows/2, (k+1)*sub1.rows/2);
+
+            GpuMat *buf_ptrs[] = { &u[k], &d[k], &l[k], &r[k], &disp_selected_pyr[k] };
+            for(int _r = 0; _r < 5; ++_r)
+            {
+                *buf_ptrs[_r] = sub2.rowRange(_r * sub2.rows/5, (_r+1) * sub2.rows/5);
+                CV_DbgAssert( buf_ptrs[_r]->cols == cols && buf_ptrs[_r]->rows == rows * nr_plane_ );
+            }
+        }
+
+        // Row stride of mbuf_ in elements; passed to the device functions as msg_step.
+        size_t elem_step = mbuf_.step / mbuf_.elemSize();
+
+        // temp_ is at least as large as data_cost, and larger when the coarsest
+        // level with all ndisp_ disparities needs more room.
+        Size temp_size = data_cost.size();
+        if ((size_t)temp_size.area() < elem_step * rows_pyr[levels_ - 1] * ndisp_)
+            temp_size = Size(static_cast<int>(elem_step), rows_pyr[levels_ - 1] * ndisp_);
+
+        temp_.create(temp_size, msg_type_);
+
+        ////////////////////////////////////////////////////////////////////////////
+        // Compute
+
+        load_constants(ndisp_, max_data_term_, data_weight_, max_disc_term_, disc_single_jump_, min_disp_th_, left, right, temp_);
+
+        l[0].setTo(0, _stream);
+        d[0].setTo(0, _stream);
+        r[0].setTo(0, _stream);
+        u[0].setTo(0, _stream);
+
+        l[1].setTo(0, _stream);
+        d[1].setTo(0, _stream);
+        r[1].setTo(0, _stream);
+        u[1].setTo(0, _stream);
+
+        data_cost.setTo(0, _stream);
+        data_cost_selected.setTo(0, _stream);
+
+        // Index of the buffer set that currently holds the messages.
+        int cur_idx = 0;
+
+        // The two branches are identical except for the element type (float / short).
+        if (msg_type_ == CV_32F)
+        {
+            // Coarsest level first.
+            for (int i = levels_ - 1; i >= 0; i--)
+            {
+                if (i == levels_ - 1)
+                {
+                    init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<float>(), data_cost_selected.ptr<float>(),
+                        elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], ndisp_, left.channels(), use_local_init_data_cost_, stream);
+                }
+                else
+                {
+                    compute_data_cost(disp_selected_pyr[cur_idx].ptr<float>(), data_cost.ptr<float>(), elem_step,
+                        left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), stream);
+
+                    // Messages for this level go into the other buffer set.
+                    int new_idx = (cur_idx + 1) & 1;
+
+                    init_message(u[new_idx].ptr<float>(), d[new_idx].ptr<float>(), l[new_idx].ptr<float>(), r[new_idx].ptr<float>(),
+                                 u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
+                                 disp_selected_pyr[new_idx].ptr<float>(), disp_selected_pyr[cur_idx].ptr<float>(),
+                                 data_cost_selected.ptr<float>(), data_cost.ptr<float>(), elem_step, rows_pyr[i],
+                                 cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], stream);
+
+                    cur_idx = new_idx;
+                }
+
+                calc_all_iterations(u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
+                                    data_cost_selected.ptr<float>(), disp_selected_pyr[cur_idx].ptr<float>(), elem_step,
+                                    rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], iters_, stream);
+            }
+        }
+        else
+        {
+            // Coarsest level first.
+            for (int i = levels_ - 1; i >= 0; i--)
+            {
+                if (i == levels_ - 1)
+                {
+                    init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<short>(), data_cost_selected.ptr<short>(),
+                        elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], ndisp_, left.channels(), use_local_init_data_cost_, stream);
+                }
+                else
+                {
+                    compute_data_cost(disp_selected_pyr[cur_idx].ptr<short>(), data_cost.ptr<short>(), elem_step,
+                        left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), stream);
+
+                    // Messages for this level go into the other buffer set.
+                    int new_idx = (cur_idx + 1) & 1;
+
+                    init_message(u[new_idx].ptr<short>(), d[new_idx].ptr<short>(), l[new_idx].ptr<short>(), r[new_idx].ptr<short>(),
+                                 u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
+                                 disp_selected_pyr[new_idx].ptr<short>(), disp_selected_pyr[cur_idx].ptr<short>(),
+                                 data_cost_selected.ptr<short>(), data_cost.ptr<short>(), elem_step, rows_pyr[i],
+                                 cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], stream);
+
+                    cur_idx = new_idx;
+                }
+
+                calc_all_iterations(u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
+                                    data_cost_selected.ptr<short>(), disp_selected_pyr[cur_idx].ptr<short>(), elem_step,
+                                    rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], iters_, stream);
+            }
+        }
+
+        const int dtype = disp.fixedType() ? disp.type() : CV_16SC1;
+
+        disp.create(rows, cols, dtype);
+        GpuMat out = disp.getGpuMat();
+
+        // compute_disp writes 16-bit values, so any other requested type goes
+        // through the outBuf_ staging buffer and is converted afterwards.
+        if (dtype != CV_16SC1)
+        {
+            outBuf_.create(rows, cols, CV_16SC1);
+            out = outBuf_;
+        }
+
+        out.setTo(0, _stream);
+
+        if (msg_type_ == CV_32F)
+        {
+            compute_disp(u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
+                         data_cost_selected.ptr<float>(), disp_selected_pyr[cur_idx].ptr<float>(), elem_step, out, nr_plane_pyr[0], stream);
+        }
+        else
+        {
+            compute_disp(u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
+                         data_cost_selected.ptr<short>(), disp_selected_pyr[cur_idx].ptr<short>(), elem_step, out, nr_plane_pyr[0], stream);
+        }
+
+        if (dtype != CV_16SC1)
+            out.convertTo(disp, dtype, _stream);
+    }
+
+    // Data-cost overload required by the interface; this implementation does
+    // not provide it and always raises Error::StsNotImplemented.
+    void StereoCSBPImpl::compute(InputArray /*data*/, OutputArray /*disparity*/, Stream& /*stream*/)
+    {
+        CV_Error(Error::StsNotImplemented, "Not implemented");
+    }
+}
+
+// Factory: creates a StereoCSBPImpl and returns it through the cuda::StereoConstantSpaceBP interface.
+Ptr<cuda::StereoConstantSpaceBP> cv::cuda::createStereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, int msg_type)
+{
+    Ptr<cuda::StereoConstantSpaceBP> impl = new StereoCSBPImpl(ndisp, iters, levels, nr_plane, msg_type);
+    return impl;
+}
+
+// Suggests ndisp, iters, levels and nr_plane for an image of the given size.
+//
+// width, height - image size; both must be positive.
+// ndisp    - receives (int)(width / 3.14f), increased by one when that value is odd.
+// iters    - receives max(width, height) / 100, minus 4 when the larger side
+//            exceeds 1200 and plus 4 otherwise.
+// levels   - receives (int)ln(max(width, height)) * 2 / 3, but at least 1.
+// nr_plane - receives (int)(ndisp / 2^(levels + 1)), but at least 1.
+//
+// CV_Assert fires when width or height is not positive: the logarithm of
+// such a value is -inf or NaN, and converting that to int is undefined.
+void cv::cuda::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane)
+{
+    CV_Assert( width > 0 && height > 0 );
+
+    ndisp = (int) ((float) width / 3.14f);
+    if ((ndisp & 1) != 0)
+        ndisp++;
+
+    const int mm = std::max(width, height);
+    iters = mm / 100 + ((mm > 1200) ? -4 : 4);
+
+    // The cast applies to ln(mm) only; the * 2 / 3 is integer arithmetic.
+    levels = (int)::log(static_cast<double>(mm)) * 2 / 3;
+    if (levels == 0) levels++;
+
+    // For very small widths the quotient truncates to 0, a value that
+    // compute() rejects; recommend one plane instead.
+    nr_plane = (int) ((float) ndisp / std::pow(2.0, levels + 1));
+    if (nr_plane == 0) nr_plane++;
+}
+
+#endif /* !defined (HAVE_CUDA) */
--- a/modules/cudastereo/src/util.cpp
+++ b/modules/cudastereo/src/util.cpp
@@ -0,0 +1,123 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+
+void cv::cuda::reprojectImageTo3D(InputArray, OutputArray, InputArray, int, Stream&) { throw_no_cuda(); }
+void cv::cuda::drawColorDisp(InputArray, OutputArray, int, Stream&) { throw_no_cuda(); }
+
+#else
+
+////////////////////////////////////////////////////////////////////////
+// reprojectImageTo3D
+
+namespace cv { namespace cuda { namespace device
+{
+    // Forward declaration only; the definition is not in this file
+    // (presumably in the module's CUDA sources - verify).
+    // cv::cuda::reprojectImageTo3D instantiates it with T = uchar or short for
+    // the disparity element type and D = float3 or float4 for the output
+    // point type, passing a pointer to the 16 floats of the 4x4 matrix as `q`.
+    template <typename T, typename D>
+    void reprojectImageTo3D_gpu(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+}}}
+
+// Host-side entry point that validates its inputs, allocates the output and
+// dispatches to the matching device::reprojectImageTo3D_gpu instantiation.
+//
+//   _disp  : disparity map; its type must be CV_8U or CV_16S.
+//   _xyz   : output, created as a CV_32F matrix of the disparity's size with
+//            dst_cn channels.
+//   _Q     : continuous 4x4 CV_32F matrix, fetched as a cv::Mat and passed on
+//            as a raw float pointer.
+//   dst_cn : number of output channels, 3 or 4.
+//   stream : its underlying cudaStream_t is forwarded to the device routine.
+//
+// Any violated precondition is reported through CV_Assert.
+void cv::cuda::reprojectImageTo3D(InputArray _disp, OutputArray _xyz, InputArray _Q, int dst_cn, Stream& stream)
+{
+    using namespace cv::cuda::device;
+
+    typedef void (*func_t)(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
+
+    // One dispatch row per output channel count, indexed by the disparity
+    // type code. Only the slots for the two asserted disparity types are
+    // populated.
+    static const func_t xyz3Funcs[4] = {reprojectImageTo3D_gpu<uchar, float3>, 0, 0, reprojectImageTo3D_gpu<short, float3>};
+    static const func_t xyz4Funcs[4] = {reprojectImageTo3D_gpu<uchar, float4>, 0, 0, reprojectImageTo3D_gpu<short, float4>};
+
+    GpuMat disp = _disp.getGpuMat();
+    Mat Q = _Q.getMat();
+
+    CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S );
+    CV_Assert( Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4 && Q.isContinuous() );
+    CV_Assert( dst_cn == 3 || dst_cn == 4 );
+
+    _xyz.create(disp.size(), CV_MAKE_TYPE(CV_32F, dst_cn));
+    GpuMat xyz = _xyz.getGpuMat();
+
+    const func_t* const row = (dst_cn == 4) ? xyz4Funcs : xyz3Funcs;
+    const func_t reproject = row[disp.type()];
+    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
+
+    reproject(disp, xyz, Q.ptr<float>(), cudaStream);
+}
+
+////////////////////////////////////////////////////////////////////////
+// drawColorDisp
+
+namespace cv { namespace cuda { namespace device
+{
+    // Forward declarations only; the definitions are not in this file
+    // (presumably in the module's CUDA sources - verify).
+    // There is one overload per supported source element type: byte elements
+    // (PtrStepSzb) and short elements. drawColorDisp_caller<T> selects between
+    // them by converting its source to PtrStepSz<T>.
+    void drawColorDisp_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream);
+    void drawColorDisp_gpu(const PtrStepSz<short>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream);
+}}}
+
+namespace
+{
+    // Creates the destination as a CV_8UC4 matrix of the source's size, then
+    // forwards to the device::drawColorDisp_gpu overload that accepts a
+    // PtrStepSz<T> view of the source.
+    template <typename T>
+    void drawColorDisp_caller(const GpuMat& src, OutputArray _dst, int ndisp, const cudaStream_t& stream)
+    {
+        using namespace ::cv::cuda::device;
+
+        _dst.create(src.size(), CV_8UC4);
+        GpuMat dst = _dst.getGpuMat();
+
+        // Typed view of the source; its element type picks the overload.
+        const PtrStepSz<T> typedSrc = static_cast<PtrStepSz<T> >(src);
+
+        drawColorDisp_gpu(typedSrc, dst, ndisp, stream);
+    }
+}
+
+// Host-side entry point that checks the source type and hands off to the
+// drawColorDisp_caller instantiation for that element type.
+//
+//   _src   : source matrix; its type must be CV_8U or CV_16S (CV_Assert).
+//   dst    : output array, allocated by drawColorDisp_caller.
+//   ndisp  : forwarded unchanged to the device routine.
+//   stream : its underlying cudaStream_t is forwarded to the device routine.
+void cv::cuda::drawColorDisp(InputArray _src, OutputArray dst, int ndisp, Stream& stream)
+{
+    GpuMat src = _src.getGpuMat();
+
+    CV_Assert( src.type() == CV_8U || src.type() == CV_16S );
+
+    const cudaStream_t cudaStream = StreamAccessor::getStream(stream);
+
+    // The assertion above leaves exactly two possible source types.
+    if (src.type() == CV_8U)
+        drawColorDisp_caller<unsigned char>(src, dst, ndisp, cudaStream);
+    else
+        drawColorDisp_caller<short>(src, dst, ndisp, cudaStream);
+}
+
+#endif