renamed gpustereo -> cudastereo

This commit is contained in:
Vladislav Vinogradov
2013-07-23 16:18:26 +04:00
parent d4901a99ea
commit 71d61e07b1
29 changed files with 21 additions and 21 deletions

View File

@@ -0,0 +1,223 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/limits.hpp"
namespace cv { namespace cuda { namespace device
{
namespace disp_bilateral_filter
{
__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;
__constant__ int cndisp;
__constant__ int cradius;
__constant__ short cedge_disc;
__constant__ short cmax_disc;
void disp_load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
}
template <int channels>
struct DistRgbMax
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
uchar x = ::abs(a[0] - b[0]);
uchar y = ::abs(a[1] - b[1]);
uchar z = ::abs(a[2] - b[2]);
return (::max(::max(x, y), z));
}
};
template <>
struct DistRgbMax<1>
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
return ::abs(a[0] - b[0]);
}
};
template <int channels, typename T>
__global__ void disp_bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
T dp[5];
if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
{
dp[0] = *(disp + (y ) * disp_step + x + 0);
dp[1] = *(disp + (y-1) * disp_step + x + 0);
dp[2] = *(disp + (y ) * disp_step + x - 1);
dp[3] = *(disp + (y+1) * disp_step + x + 0);
dp[4] = *(disp + (y ) * disp_step + x + 1);
if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)
{
const int ymin = ::max(0, y - cradius);
const int xmin = ::max(0, x - cradius);
const int ymax = ::min(h - 1, y + cradius);
const int xmax = ::min(w - 1, x + cradius);
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
const uchar* ic = img + y * img_step + channels * x;
for(int yi = ymin; yi <= ymax; yi++)
{
const T* disp_y = disp + yi * disp_step;
for(int xi = xmin; xi <= xmax; xi++)
{
const uchar* in = img + yi * img_step + channels * xi;
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];
const T disp_reg = disp_y[xi];
cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;
cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;
cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;
cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;
cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;
}
}
float minimum = numeric_limits<float>::max();
int id = 0;
if (cost[0] < minimum)
{
minimum = cost[0];
id = 0;
}
if (cost[1] < minimum)
{
minimum = cost[1];
id = 1;
}
if (cost[2] < minimum)
{
minimum = cost[2];
id = 2;
}
if (cost[3] < minimum)
{
minimum = cost[3];
id = 3;
}
if (cost[4] < minimum)
{
minimum = cost[4];
id = 4;
}
*(disp + y * disp_step + x) = dp[id];
}
}
}
template <typename T>
void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x << 1);
grid.y = divUp(disp.rows, threads.y);
switch (channels)
{
case 1:
for (int i = 0; i < iters; ++i)
{
disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
case 3:
for (int i = 0; i < iters; ++i)
{
disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
default:
CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void disp_bilateral_filter<uchar>(PtrStepSz<uchar> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
template void disp_bilateral_filter<short>(PtrStepSz<short> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
} // namespace bilateral_filter
}}} // namespace cv { namespace cuda { namespace cudev
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,540 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace cuda { namespace device
{
namespace stereobm
{
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////// Stereo BM ////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
#define ROWSperTHREAD 21 // the number of rows a thread will process
#define BLOCK_W 128 // the thread block width (464)
#define N_DISPARITIES 8
#define STEREO_MIND 0 // The minimum d range to check
#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing
__constant__ unsigned int* cminSSDImage;
__constant__ size_t cminSSD_step;
__constant__ int cwidth;
__constant__ int cheight;
__device__ __forceinline__ int SQ(int a)
{
return a * a;
}
template<int RADIUS>
__device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
{
unsigned int cache = 0;
unsigned int cache2 = 0;
for(int i = 1; i <= RADIUS; i++)
cache += col_ssd[i];
col_ssd_cache[0] = cache;
__syncthreads();
if (threadIdx.x < BLOCK_W - RADIUS)
cache2 = col_ssd_cache[RADIUS];
else
for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)
cache2 += col_ssd[i];
return col_ssd[0] + cache + cache2;
}
template<int RADIUS>
__device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
{
unsigned int ssd[N_DISPARITIES];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));
int bestIdx = 0;
for (int i = 0; i < N_DISPARITIES; i++)
{
if (mssd == ssd[i])
bestIdx = i;
}
return make_uint2(mssd, bestIdx);
}
template<int RADIUS>
__device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
{
unsigned char leftPixel1;
unsigned char leftPixel2;
unsigned char rightPixel1[8];
unsigned char rightPixel2[8];
unsigned int diff1, diff2;
leftPixel1 = imageL[idx1];
leftPixel2 = imageL[idx2];
idx1 = idx1 - d;
idx2 = idx2 - d;
rightPixel1[7] = imageR[idx1 - 7];
rightPixel1[0] = imageR[idx1 - 0];
rightPixel1[1] = imageR[idx1 - 1];
rightPixel1[2] = imageR[idx1 - 2];
rightPixel1[3] = imageR[idx1 - 3];
rightPixel1[4] = imageR[idx1 - 4];
rightPixel1[5] = imageR[idx1 - 5];
rightPixel1[6] = imageR[idx1 - 6];
rightPixel2[7] = imageR[idx2 - 7];
rightPixel2[0] = imageR[idx2 - 0];
rightPixel2[1] = imageR[idx2 - 1];
rightPixel2[2] = imageR[idx2 - 2];
rightPixel2[3] = imageR[idx2 - 3];
rightPixel2[4] = imageR[idx2 - 4];
rightPixel2[5] = imageR[idx2 - 5];
rightPixel2[6] = imageR[idx2 - 6];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
diff1 = leftPixel1 - rightPixel1[0];
diff2 = leftPixel2 - rightPixel2[0];
col_ssd[0 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[1];
diff2 = leftPixel2 - rightPixel2[1];
col_ssd[1 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[2];
diff2 = leftPixel2 - rightPixel2[2];
col_ssd[2 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[3];
diff2 = leftPixel2 - rightPixel2[3];
col_ssd[3 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[4];
diff2 = leftPixel2 - rightPixel2[4];
col_ssd[4 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[5];
diff2 = leftPixel2 - rightPixel2[5];
col_ssd[5 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[6];
diff2 = leftPixel2 - rightPixel2[6];
col_ssd[6 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[7];
diff2 = leftPixel2 - rightPixel2[7];
col_ssd[7 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
}
template<int RADIUS>
__device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
{
unsigned char leftPixel1;
int idx;
unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};
for(int i = 0; i < (2 * RADIUS + 1); i++)
{
idx = y_tex * im_pitch + x_tex;
leftPixel1 = imageL[idx];
idx = idx - d;
diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);
diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);
diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);
diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);
diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
y_tex += 1;
}
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
col_ssd[0 * (BLOCK_W + 2 * RADIUS)] = diffa[0];
col_ssd[1 * (BLOCK_W + 2 * RADIUS)] = diffa[1];
col_ssd[2 * (BLOCK_W + 2 * RADIUS)] = diffa[2];
col_ssd[3 * (BLOCK_W + 2 * RADIUS)] = diffa[3];
col_ssd[4 * (BLOCK_W + 2 * RADIUS)] = diffa[4];
col_ssd[5 * (BLOCK_W + 2 * RADIUS)] = diffa[5];
col_ssd[6 * (BLOCK_W + 2 * RADIUS)] = diffa[6];
col_ssd[7 * (BLOCK_W + 2 * RADIUS)] = diffa[7];
}
template<int RADIUS>
__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)
{
extern __shared__ unsigned int col_ssd_cache[];
volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS)
//#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
//#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)
#define Y (blockIdx.y * ROWSperTHREAD + RADIUS)
//int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
unsigned char* disparImage = disp.data + X + Y * disp.step;
/* if (X < cwidth)
{
unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
*ptr = 0xFFFFFFFF;
}*/
int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
int y_tex;
int x_tex = X - RADIUS;
if (x_tex >= cwidth)
return;
for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
{
y_tex = Y - RADIUS;
InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);
if (col_ssd_extra > 0)
if (x_tex + BLOCK_W < cwidth)
InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
__syncthreads(); //before MinSSD function
if (X < cwidth - RADIUS && Y < cheight - RADIUS)
{
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
if (minSSD.x < minSSDImage[0])
{
disparImage[0] = (unsigned char)(d + minSSD.y);
minSSDImage[0] = minSSD.x;
}
}
for(int row = 1; row < end_row; row++)
{
int idx1 = y_tex * img_step + x_tex;
int idx2 = (y_tex + (2 * RADIUS + 1)) * img_step + x_tex;
__syncthreads();
StepDown<RADIUS>(idx1, idx2, left, right, d, col_ssd);
if (col_ssd_extra)
if (x_tex + BLOCK_W < cwidth)
StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
y_tex += 1;
__syncthreads(); //before MinSSD function
if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
{
int idx = row * cminSSD_step;
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
if (minSSD.x < minSSDImage[idx])
{
disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
minSSDImage[idx] = minSSD.x;
}
}
} // for row loop
} // for d loop
}
template<int RADIUS> void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream)
{
dim3 grid(1,1,1);
dim3 threads(BLOCK_W, 1, 1);
grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
};
typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream);
const static kernel_caller_t callers[] =
{
0,
kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
//0,0,0, 0,0,0, 0,0,kernel_caller<9>
};
const int calles_num = sizeof(callers)/sizeof(callers[0]);
void stereoBM_GPU(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
{
int winsz2 = winsz >> 1;
if (winsz2 == 0 || winsz2 >= calles_num)
CV_Error(cv::Error::StsBadArg, "Unsupported window size");
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) );
callers[winsz2](left, right, disp, maxdisp, stream);
}
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////// Sobel Prefiler ///////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
texture<unsigned char, 2, cudaReadModeElementType> texForSobel;
__global__ void prefilter_kernel(PtrStepSzb output, int prefilterCap)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < output.cols && y < output.rows)
{
int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +
(int)tex2D(texForSobel, x - 1, y ) * (-2) + (int)tex2D(texForSobel, x + 1, y ) * (2) +
(int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);
conv = ::min(::min(::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);
output.ptr(y)[x] = conv & 0xFF;
}
}
void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, cudaStream_t & stream)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, threads.y);
prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForSobel ) );
}
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// Textureness filtering ////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
__device__ __forceinline__ float sobel(int x, int y)
{
float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) +
tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
return fabs(conv);
}
__device__ float CalcSums(float *cols, float *cols_cache, int winsz)
{
float cache = 0;
float cache2 = 0;
int winsz2 = winsz/2;
for(int i = 1; i <= winsz2; i++)
cache += cols[i];
cols_cache[0] = cache;
__syncthreads();
if (threadIdx.x < blockDim.x - winsz2)
cache2 = cols_cache[winsz2];
else
for(int i = winsz2 + 1; i < winsz; i++)
cache2 += cols[i];
return cols[0] + cache + cache2;
}
#define RpT (2 * ROWSperTHREAD) // got experimentally
__global__ void textureness_kernel(PtrStepSzb disp, int winsz, float threshold)
{
int winsz2 = winsz/2;
int n_dirty_pixels = (winsz2) * 2;
extern __shared__ float cols_cache[];
float *cols = cols_cache + blockDim.x + threadIdx.x;
float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int beg_row = blockIdx.y * RpT;
int end_row = ::min(beg_row + RpT, disp.rows);
if (x < disp.cols)
{
int y = beg_row;
float sum = 0;
float sum_extra = 0;
for(int i = y - winsz2; i <= y + winsz2; ++i)
{
sum += sobel(x - winsz2, i);
if (cols_extra)
sum_extra += sobel(x + blockDim.x - winsz2, i);
}
*cols = sum;
if (cols_extra)
*cols_extra = sum_extra;
__syncthreads();
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
if (sum_win < threshold)
disp.data[y * disp.step + x] = 0;
__syncthreads();
for(int y = beg_row + 1; y < end_row; ++y)
{
sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);
*cols = sum;
if (cols_extra)
{
sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
*cols_extra = sum_extra;
}
__syncthreads();
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
if (sum_win < threshold)
disp.data[y * disp.step + x] = 0;
__syncthreads();
}
}
}
void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream)
{
avgTexturenessThreshold *= winsz * winsz;
texForTF.filterMode = cudaFilterModeLinear;
texForTF.addressMode[0] = cudaAddressModeWrap;
texForTF.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );
dim3 threads(128, 1, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, RpT);
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForTF) );
}
} // namespace stereobm
}}} // namespace cv { namespace cuda { namespace cudev
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,538 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/limits.hpp"
namespace cv { namespace cuda { namespace device
{
namespace stereobp
{
///////////////////////////////////////////////////////////////
/////////////////////// load constants ////////////////////////
///////////////////////////////////////////////////////////////
__constant__ int cndisp;
__constant__ float cmax_data_term;
__constant__ float cdata_weight;
__constant__ float cmax_disc_term;
__constant__ float cdisc_single_jump;
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
{
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int )) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
}
///////////////////////////////////////////////////////////////
////////////////////////// comp data //////////////////////////
///////////////////////////////////////////////////////////////
template <int cn> struct PixDiff;
template <> struct PixDiff<1>
{
__device__ __forceinline__ PixDiff(const uchar* ls)
{
l = *ls;
}
__device__ __forceinline__ float operator()(const uchar* rs) const
{
return ::abs((int)l - *rs);
}
uchar l;
};
template <> struct PixDiff<3>
{
__device__ __forceinline__ PixDiff(const uchar* ls)
{
l = *((uchar3*)ls);
}
__device__ __forceinline__ float operator()(const uchar* rs) const
{
const float tr = 0.299f;
const float tg = 0.587f;
const float tb = 0.114f;
float val = tb * ::abs((int)l.x - rs[0]);
val += tg * ::abs((int)l.y - rs[1]);
val += tr * ::abs((int)l.z - rs[2]);
return val;
}
uchar3 l;
};
template <> struct PixDiff<4>
{
__device__ __forceinline__ PixDiff(const uchar* ls)
{
l = *((uchar4*)ls);
}
__device__ __forceinline__ float operator()(const uchar* rs) const
{
const float tr = 0.299f;
const float tg = 0.587f;
const float tb = 0.114f;
uchar4 r = *((uchar4*)rs);
float val = tb * ::abs((int)l.x - r.x);
val += tg * ::abs((int)l.y - r.y);
val += tr * ::abs((int)l.z - r.z);
return val;
}
uchar4 l;
};
template <int cn, typename D>
__global__ void comp_data(const PtrStepSzb left, const PtrStepb right, PtrStep<D> data)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)
{
const uchar* ls = left.ptr(y) + x * cn;
const PixDiff<cn> pixDiff(ls);
const uchar* rs = right.ptr(y) + x * cn;
D* ds = data.ptr(y) + x;
const size_t disp_step = data.step * left.rows / sizeof(D);
for (int disp = 0; disp < cndisp; disp++)
{
if (x - disp >= 1)
{
float val = pixDiff(rs - disp * cn);
ds[disp * disp_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));
}
else
{
ds[disp * disp_step] = saturate_cast<D>(cdata_weight * cmax_data_term);
}
}
}
}
template<typename T, typename D>
void comp_data_gpu(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
template <> void comp_data_gpu<uchar, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar3, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar3, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar4, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar4, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////
//////////////////////// data step down ///////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst_cols && y < dst_rows)
{
for (int d = 0; d < cndisp; ++d)
{
float dst_reg = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];
dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];
dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];
dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];
dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);
}
}
}
template<typename T>
void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(dst_cols, threads.x);
grid.y = divUp(dst_rows, threads.y);
data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)src, (PtrStepSz<T>)dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
///////////////////////////////////////////////////////////////
/////////////////// level up messages ////////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst_cols && y < dst_rows)
{
const size_t dst_disp_step = dst.step * dst_rows / sizeof(T);
const size_t src_disp_step = src.step * src_rows / sizeof(T);
T* dstr = dst.ptr(y ) + x;
const T* srcr = src.ptr(y/2) + x/2;
for (int d = 0; d < cndisp; ++d)
dstr[d * dst_disp_step] = srcr[d * src_disp_step];
}
}
template <typename T>
void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(dst_cols, threads.x);
grid.y = divUp(dst_rows, threads.y);
int src_idx = (dst_idx + 1) & 1;
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mus[src_idx], (PtrStepSz<T>)mus[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mds[src_idx], (PtrStepSz<T>)mds[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mls[src_idx], (PtrStepSz<T>)mls[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mrs[src_idx], (PtrStepSz<T>)mrs[dst_idx]);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
///////////////////////////////////////////////////////////////
//////////////////// calc all iterations /////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__device__ void calc_min_linear_penalty(T* dst, size_t step)
{
float prev = dst[0];
float cur;
for (int disp = 1; disp < cndisp; ++disp)
{
prev += cdisc_single_jump;
cur = dst[step * disp];
if (prev < cur)
{
cur = prev;
dst[step * disp] = saturate_cast<T>(prev);
}
prev = cur;
}
prev = dst[(cndisp - 1) * step];
for (int disp = cndisp - 2; disp >= 0; disp--)
{
prev += cdisc_single_jump;
cur = dst[step * disp];
if (prev < cur)
{
cur = prev;
dst[step * disp] = saturate_cast<T>(prev);
}
prev = cur;
}
}
template <typename T>
__device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
{
float minimum = device::numeric_limits<float>::max();
for(int i = 0; i < cndisp; ++i)
{
float dst_reg = msg1[msg_disp_step * i];
dst_reg += msg2[msg_disp_step * i];
dst_reg += msg3[msg_disp_step * i];
dst_reg += data[data_disp_step * i];
if (dst_reg < minimum)
minimum = dst_reg;
dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);
}
calc_min_linear_penalty(dst, msg_disp_step);
minimum += cmax_disc_term;
float sum = 0;
for(int i = 0; i < cndisp; ++i)
{
float dst_reg = dst[msg_disp_step * i];
if (dst_reg > minimum)
{
dst_reg = minimum;
dst[msg_disp_step * i] = saturate_cast<T>(minimum);
}
sum += dst_reg;
}
sum /= cndisp;
for(int i = 0; i < cndisp; ++i)
dst[msg_disp_step * i] -= sum;
}
template <typename T>
__global__ void one_iteration(int t, int elem_step, T* u, T* d, T* l, T* r, const PtrStep<T> data, int cols, int rows)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
{
T* us = u + y * elem_step + x;
T* ds = d + y * elem_step + x;
T* ls = l + y * elem_step + x;
T* rs = r + y * elem_step + x;
const T* dt = data.ptr(y) + x;
size_t msg_disp_step = elem_step * rows;
size_t data_disp_step = data.step * rows / sizeof(T);
message(us + elem_step, ls + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
message(ds - elem_step, ls + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
message(us + elem_step, ds - elem_step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
message(us + elem_step, ds - elem_step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
}
}
template <typename T>
void calc_all_iterations_gpu(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d,
const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(cols, threads.x << 1);
grid.y = divUp(rows, threads.y);
int elem_step = (int)(u.step / sizeof(T));
for(int t = 0; t < iters; ++t)
{
one_iteration<T><<<grid, threads, 0, stream>>>(t, elem_step, (T*)u.data, (T*)d.data, (T*)l.data, (T*)r.data, (PtrStepSz<T>)data, cols, rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
///////////////////////////////////////////////////////////////
/////////////////////////// output ////////////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__global__ void output(const int elem_step, const T* u, const T* d, const T* l, const T* r, const T* data,
PtrStepSz<short> disp)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
{
const T* us = u + (y + 1) * elem_step + x;
const T* ds = d + (y - 1) * elem_step + x;
const T* ls = l + y * elem_step + (x + 1);
const T* rs = r + y * elem_step+ (x - 1);
const T* dt = data + y * elem_step + x;
size_t disp_step = disp.rows * elem_step;
int best = 0;
float best_val = numeric_limits<float>::max();
for (int d = 0; d < cndisp; ++d)
{
float val = us[d * disp_step];
val += ds[d * disp_step];
val += ls[d * disp_step];
val += rs[d * disp_step];
val += dt[d * disp_step];
if (val < best_val)
{
best_val = val;
best = d;
}
}
disp.ptr(y)[x] = saturate_cast<short>(best);
}
}
template <typename T>
void output_gpu(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data,
const PtrStepSz<short>& disp, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x);
grid.y = divUp(disp.rows, threads.y);
int elem_step = static_cast<int>(u.step/sizeof(T));
output<T><<<grid, threads, 0, stream>>>(elem_step, (const T*)u.data, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void output_gpu<short>(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
template void output_gpu<float>(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
} // namespace stereobp
}}} // namespace cv { namespace cuda { namespace cudev
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,864 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
namespace cv { namespace cuda { namespace device
{
namespace stereocsbp
{
///////////////////////////////////////////////////////////////
/////////////////////// load constants ////////////////////////
///////////////////////////////////////////////////////////////
__constant__ int cndisp;
__constant__ float cmax_data_term;
__constant__ float cdata_weight;
__constant__ float cmax_disc_term;
__constant__ float cdisc_single_jump;
__constant__ int cth;
__constant__ size_t cimg_step;
__constant__ size_t cmsg_step;
__constant__ size_t cdisp_step1;
__constant__ size_t cdisp_step2;
__constant__ uchar* cleft;
__constant__ uchar* cright;
__constant__ uchar* ctemp;
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& temp)
{
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cleft, &left.data, sizeof(left.data)) );
cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) );
cudaSafeCall( cudaMemcpyToSymbol(ctemp, &temp.data, sizeof(temp.data)) );
}
///////////////////////////////////////////////////////////////
/////////////////////// init data cost ////////////////////////
///////////////////////////////////////////////////////////////
template <int channels> struct DataCostPerPixel;
template <> struct DataCostPerPixel<1>
{
static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
{
return fmin(cdata_weight * ::abs((int)*left - *right), cdata_weight * cmax_data_term);
}
};
template <> struct DataCostPerPixel<3>
{
static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
{
float tb = 0.114f * ::abs((int)left[0] - right[0]);
float tg = 0.587f * ::abs((int)left[1] - right[1]);
float tr = 0.299f * ::abs((int)left[2] - right[2]);
return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
}
};
template <> struct DataCostPerPixel<4>
{
static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
{
uchar4 l = *((const uchar4*)left);
uchar4 r = *((const uchar4*)right);
float tb = 0.114f * ::abs((int)l.x - r.x);
float tg = 0.587f * ::abs((int)l.y - r.y);
float tr = 0.299f * ::abs((int)l.z - r.z);
return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
}
};
template <typename T>
__global__ void get_first_k_initial_global(T* data_cost_selected_, T *selected_disp_pyr, int h, int w, int nr_plane)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < h && x < w)
{
T* selected_disparity = selected_disp_pyr + y * cmsg_step + x;
T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
T* data_cost = (T*)ctemp + y * cmsg_step + x;
for(int i = 0; i < nr_plane; i++)
{
T minimum = device::numeric_limits<T>::max();
int id = 0;
for(int d = 0; d < cndisp; d++)
{
T cur = data_cost[d * cdisp_step1];
if(cur < minimum)
{
minimum = cur;
id = d;
}
}
data_cost_selected[i * cdisp_step1] = minimum;
selected_disparity[i * cdisp_step1] = id;
data_cost [id * cdisp_step1] = numeric_limits<T>::max();
}
}
}
template <typename T>
__global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < h && x < w)
{
T* selected_disparity = selected_disp_pyr + y * cmsg_step + x;
T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
T* data_cost = (T*)ctemp + y * cmsg_step + x;
int nr_local_minimum = 0;
T prev = data_cost[0 * cdisp_step1];
T cur = data_cost[1 * cdisp_step1];
T next = data_cost[2 * cdisp_step1];
for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)
{
if (cur < prev && cur < next)
{
data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
selected_disparity[nr_local_minimum * cdisp_step1] = d;
data_cost[d * cdisp_step1] = numeric_limits<T>::max();
nr_local_minimum++;
}
prev = cur;
cur = next;
next = data_cost[(d + 1) * cdisp_step1];
}
for (int i = nr_local_minimum; i < nr_plane; i++)
{
T minimum = numeric_limits<T>::max();
int id = 0;
for (int d = 0; d < cndisp; d++)
{
cur = data_cost[d * cdisp_step1];
if (cur < minimum)
{
minimum = cur;
id = d;
}
}
data_cost_selected[i * cdisp_step1] = minimum;
selected_disparity[i * cdisp_step1] = id;
data_cost[id * cdisp_step1] = numeric_limits<T>::max();
}
}
}
template <typename T, int channels>
__global__ void init_data_cost(int h, int w, int level)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < h && x < w)
{
int y0 = y << level;
int yt = (y + 1) << level;
int x0 = x << level;
int xt = (x + 1) << level;
T* data_cost = (T*)ctemp + y * cmsg_step + x;
for(int d = 0; d < cndisp; ++d)
{
float val = 0.0f;
for(int yi = y0; yi < yt; yi++)
{
for(int xi = x0; xi < xt; xi++)
{
int xr = xi - d;
if(d < cth || xr < 0)
val += cdata_weight * cmax_data_term;
else
{
const uchar* lle = cleft + yi * cimg_step + xi * channels;
const uchar* lri = cright + yi * cimg_step + xr * channels;
val += DataCostPerPixel<channels>::compute(lle, lri);
}
}
}
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
}
}
}
template <typename T, int winsz, int channels>
__global__ void init_data_cost_reduce(int level, int rows, int cols, int h)
{
int x_out = blockIdx.x;
int y_out = blockIdx.y % h;
int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;
int tid = threadIdx.x;
if (d < cndisp)
{
int x0 = x_out << level;
int y0 = y_out << level;
int len = ::min(y0 + winsz, rows) - y0;
float val = 0.0f;
if (x0 + tid < cols)
{
if (x0 + tid - d < 0 || d < cth)
val = cdata_weight * cmax_data_term * len;
else
{
const uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid );
const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);
for(int y = 0; y < len; ++y)
{
val += DataCostPerPixel<channels>::compute(lle, lri);
lle += cimg_step;
lri += cimg_step;
}
}
}
extern __shared__ float smem[];
reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
T* data_cost = (T*)ctemp + y_out * cmsg_step + x_out;
if (tid == 0)
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
}
}
template <typename T>
void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(w, threads.x);
grid.y = divUp(h, threads.y);
switch (channels)
{
case 1: init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break;
case 3: init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break;
case 4: init_data_cost<T, 4><<<grid, threads, 0, stream>>>(h, w, level); break;
default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
}
}
template <typename T, int winsz>
void init_data_cost_reduce_caller_(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream)
{
const int threadsNum = 256;
const size_t smem_size = threadsNum * sizeof(float);
dim3 threads(winsz, 1, threadsNum / winsz);
dim3 grid(w, h, 1);
grid.y *= divUp(ndisp, threads.z);
switch (channels)
{
case 1: init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
case 3: init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
case 4: init_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
}
}
template<class T>
void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
{
typedef void (*InitDataCostCaller)(int cols, int rows, int w, int h, int level, int ndisp, int channels, cudaStream_t stream);
static const InitDataCostCaller init_data_cost_callers[] =
{
init_data_cost_caller_<T>, init_data_cost_caller_<T>, init_data_cost_reduce_caller_<T, 4>,
init_data_cost_reduce_caller_<T, 8>, init_data_cost_reduce_caller_<T, 16>, init_data_cost_reduce_caller_<T, 32>,
init_data_cost_reduce_caller_<T, 64>, init_data_cost_reduce_caller_<T, 128>, init_data_cost_reduce_caller_<T, 256>
};
size_t disp_step = msg_step * h;
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step, &msg_step, sizeof(size_t)) );
init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(w, threads.x);
grid.y = divUp(h, threads.y);
if (use_local_init_data_cost == true)
get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
else
get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
///////////////////////////////////////////////////////////////
////////////////////// compute data cost //////////////////////
///////////////////////////////////////////////////////////////
template <typename T, int channels>
__global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < h && x < w)
{
int y0 = y << level;
int yt = (y + 1) << level;
int x0 = x << level;
int xt = (x + 1) << level;
const T* selected_disparity = selected_disp_pyr + y/2 * cmsg_step + x/2;
T* data_cost = data_cost_ + y * cmsg_step + x;
for(int d = 0; d < nr_plane; d++)
{
float val = 0.0f;
for(int yi = y0; yi < yt; yi++)
{
for(int xi = x0; xi < xt; xi++)
{
int sel_disp = selected_disparity[d * cdisp_step2];
int xr = xi - sel_disp;
if (xr < 0 || sel_disp < cth)
val += cdata_weight * cmax_data_term;
else
{
const uchar* left_x = cleft + yi * cimg_step + xi * channels;
const uchar* right_x = cright + yi * cimg_step + xr * channels;
val += DataCostPerPixel<channels>::compute(left_x, right_x);
}
}
}
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
}
}
}
template <typename T, int winsz, int channels>
__global__ void compute_data_cost_reduce(const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane)
{
int x_out = blockIdx.x;
int y_out = blockIdx.y % h;
int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;
int tid = threadIdx.x;
const T* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step + x_out/2;
T* data_cost = data_cost_ + y_out * cmsg_step + x_out;
if (d < nr_plane)
{
int sel_disp = selected_disparity[d * cdisp_step2];
int x0 = x_out << level;
int y0 = y_out << level;
int len = ::min(y0 + winsz, rows) - y0;
float val = 0.0f;
if (x0 + tid < cols)
{
if (x0 + tid - sel_disp < 0 || sel_disp < cth)
val = cdata_weight * cmax_data_term * len;
else
{
const uchar* lle = cleft + y0 * cimg_step + channels * (x0 + tid );
const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);
for(int y = 0; y < len; ++y)
{
val += DataCostPerPixel<channels>::compute(lle, lri);
lle += cimg_step;
lri += cimg_step;
}
}
}
extern __shared__ float smem[];
reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
if (tid == 0)
data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
}
}
template <typename T>
void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,
int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(w, threads.x);
grid.y = divUp(h, threads.y);
switch(channels)
{
case 1: compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
case 3: compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
case 4: compute_data_cost<T, 4><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
}
}
template <typename T, int winsz>
void compute_data_cost_reduce_caller_(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
{
const int threadsNum = 256;
const size_t smem_size = threadsNum * sizeof(float);
dim3 threads(winsz, 1, threadsNum / winsz);
dim3 grid(w, h, 1);
grid.y *= divUp(nr_plane, threads.z);
switch (channels)
{
case 1: compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
case 3: compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
case 4: compute_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
}
}
template<class T>
void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)
{
typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
int h, int w, int level, int nr_plane, int channels, cudaStream_t stream);
static const ComputeDataCostCaller callers[] =
{
compute_data_cost_caller_<T>, compute_data_cost_caller_<T>, compute_data_cost_reduce_caller_<T, 4>,
compute_data_cost_reduce_caller_<T, 8>, compute_data_cost_reduce_caller_<T, 16>, compute_data_cost_reduce_caller_<T, 32>,
compute_data_cost_reduce_caller_<T, 64>, compute_data_cost_reduce_caller_<T, 128>, compute_data_cost_reduce_caller_<T, 256>
};
size_t disp_step1 = msg_step * h;
size_t disp_step2 = msg_step * h2;
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step, &msg_step, sizeof(size_t)) );
callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
///////////////////////////////////////////////////////////////
//////////////////////// init message /////////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,
const T* data_cost_cur, const T* disparity_selected_cur,
int nr_plane, int nr_plane2)
{
for(int i = 0; i < nr_plane; i++)
{
T minimum = numeric_limits<T>::max();
int id = 0;
for(int j = 0; j < nr_plane2; j++)
{
T cur = data_cost_new[j * cdisp_step1];
if(cur < minimum)
{
minimum = cur;
id = j;
}
}
data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1];
disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];
u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];
d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];
l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];
data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();
}
}
template <typename T>
__global__ void init_message(T* u_new_, T* d_new_, T* l_new_, T* r_new_,
const T* u_cur_, const T* d_cur_, const T* l_cur_, const T* r_cur_,
T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
T* data_cost_selected_, const T* data_cost_,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < h && x < w)
{
const T* u_cur = u_cur_ + ::min(h2-1, y/2 + 1) * cmsg_step + x/2;
const T* d_cur = d_cur_ + ::max(0, y/2 - 1) * cmsg_step + x/2;
const T* l_cur = l_cur_ + (y/2) * cmsg_step + ::min(w2-1, x/2 + 1);
const T* r_cur = r_cur_ + (y/2) * cmsg_step + ::max(0, x/2 - 1);
T* data_cost_new = (T*)ctemp + y * cmsg_step + x;
const T* disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step + x/2;
const T* data_cost = data_cost_ + y * cmsg_step + x;
for(int d = 0; d < nr_plane2; d++)
{
int idx2 = d * cdisp_step2;
T val = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];
data_cost_new[d * cdisp_step1] = val;
}
T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
T* disparity_selected_new = selected_disp_pyr_new + y * cmsg_step + x;
T* u_new = u_new_ + y * cmsg_step + x;
T* d_new = d_new_ + y * cmsg_step + x;
T* l_new = l_new_ + y * cmsg_step + x;
T* r_new = r_new_ + y * cmsg_step + x;
u_cur = u_cur_ + y/2 * cmsg_step + x/2;
d_cur = d_cur_ + y/2 * cmsg_step + x/2;
l_cur = l_cur_ + y/2 * cmsg_step + x/2;
r_cur = r_cur_ + y/2 * cmsg_step + x/2;
get_first_k_element_increase(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,
data_cost_selected, disparity_selected_new, data_cost_new,
data_cost, disparity_selected_cur, nr_plane, nr_plane2);
}
}
template<class T>
void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
T* data_cost_selected, const T* data_cost, size_t msg_step,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream)
{
size_t disp_step1 = msg_step * h;
size_t disp_step2 = msg_step * h2;
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step, &msg_step, sizeof(size_t)) );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(w, threads.x);
grid.y = divUp(h, threads.y);
init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,
u_cur, d_cur, l_cur, r_cur,
selected_disp_pyr_new, selected_disp_pyr_cur,
data_cost_selected, data_cost,
h, w, nr_plane, h2, w2, nr_plane2);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void init_message(short* u_new, short* d_new, short* l_new, short* r_new,
const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,
short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,
short* data_cost_selected, const short* data_cost, size_t msg_step,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
template void init_message(float* u_new, float* d_new, float* l_new, float* r_new,
const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,
float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,
float* data_cost_selected, const float* data_cost, size_t msg_step,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
///////////////////////////////////////////////////////////////
//////////////////// calc all iterations /////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
const T* dst_disp, const T* src_disp, int nr_plane, volatile T* temp)
{
T minimum = numeric_limits<T>::max();
for(int d = 0; d < nr_plane; d++)
{
int idx = d * cdisp_step1;
T val = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];
if(val < minimum)
minimum = val;
msg_dst[idx] = val;
}
float sum = 0;
for(int d = 0; d < nr_plane; d++)
{
float cost_min = minimum + cmax_disc_term;
T src_disp_reg = src_disp[d * cdisp_step1];
for(int d2 = 0; d2 < nr_plane; d2++)
cost_min = fmin(cost_min, msg_dst[d2 * cdisp_step1] + cdisc_single_jump * ::abs(dst_disp[d2 * cdisp_step1] - src_disp_reg));
temp[d * cdisp_step1] = saturate_cast<T>(cost_min);
sum += cost_min;
}
sum /= nr_plane;
for(int d = 0; d < nr_plane; d++)
msg_dst[d * cdisp_step1] = saturate_cast<T>(temp[d * cdisp_step1] - sum);
}
template <typename T>
__global__ void compute_message(T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i)
{
int y = blockIdx.y * blockDim.y + threadIdx.y;
int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + i) & 1);
if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
{
const T* data = data_cost_selected + y * cmsg_step + x;
T* u = u_ + y * cmsg_step + x;
T* d = d_ + y * cmsg_step + x;
T* l = l_ + y * cmsg_step + x;
T* r = r_ + y * cmsg_step + x;
const T* disp = selected_disp_pyr_cur + y * cmsg_step + x;
T* temp = (T*)ctemp + y * cmsg_step + x;
message_per_pixel(data, u, r - 1, u + cmsg_step, l + 1, disp, disp - cmsg_step, nr_plane, temp);
message_per_pixel(data, d, d - cmsg_step, r - 1, l + 1, disp, disp + cmsg_step, nr_plane, temp);
message_per_pixel(data, l, u + cmsg_step, d - cmsg_step, l + 1, disp, disp - 1, nr_plane, temp);
message_per_pixel(data, r, u + cmsg_step, d - cmsg_step, r - 1, disp, disp + 1, nr_plane, temp);
}
}
template<class T>
void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)
{
size_t disp_step = msg_step * h;
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step, &msg_step, sizeof(size_t)) );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(w, threads.x << 1);
grid.y = divUp(h, threads.y);
for(int t = 0; t < iters; ++t)
{
compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
};
template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,
int h, int w, int nr_plane, int iters, cudaStream_t stream);
template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step,
int h, int w, int nr_plane, int iters, cudaStream_t stream);
///////////////////////////////////////////////////////////////
/////////////////////////// output ////////////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,
const T* data_cost_selected, const T* disp_selected_pyr,
PtrStepSz<short> disp, int nr_plane)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
{
const T* data = data_cost_selected + y * cmsg_step + x;
const T* disp_selected = disp_selected_pyr + y * cmsg_step + x;
const T* u = u_ + (y+1) * cmsg_step + (x+0);
const T* d = d_ + (y-1) * cmsg_step + (x+0);
const T* l = l_ + (y+0) * cmsg_step + (x+1);
const T* r = r_ + (y+0) * cmsg_step + (x-1);
int best = 0;
T best_val = numeric_limits<T>::max();
for (int i = 0; i < nr_plane; ++i)
{
int idx = i * cdisp_step1;
T val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];
if (val < best_val)
{
best_val = val;
best = saturate_cast<short>(disp_selected[idx]);
}
}
disp(y, x) = best;
}
}
template<class T>
void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream)
{
size_t disp_step = disp.rows * msg_step;
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step, &msg_step, sizeof(size_t)) );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x);
grid.y = divUp(disp.rows, threads.y);
compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected, disp, nr_plane);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,
const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,
const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
} // namespace stereocsbp
}}} // namespace cv { namespace cuda { namespace cudev {
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,235 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
namespace cv { namespace cuda { namespace device
{
/////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
__constant__ float cq[16];
template <typename T, typename D>
__global__ void reprojectImageTo3D(const PtrStepSz<T> disp, PtrStep<D> xyz)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y >= disp.rows || x >= disp.cols)
return;
const float qx = x * cq[ 0] + y * cq[ 1] + cq[ 3];
const float qy = x * cq[ 4] + y * cq[ 5] + cq[ 7];
const float qz = x * cq[ 8] + y * cq[ 9] + cq[11];
const float qw = x * cq[12] + y * cq[13] + cq[15];
const T d = disp(y, x);
const float iW = 1.f / (qw + cq[14] * d);
D v = VecTraits<D>::all(1.0f);
v.x = (qx + cq[2] * d) * iW;
v.y = (qy + cq[6] * d) * iW;
v.z = (qz + cq[10] * d) * iW;
xyz(y, x) = v;
}
template <typename T, typename D>
void reprojectImageTo3D_gpu(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(disp.cols, block.x), divUp(disp.rows, block.y));
cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );
reprojectImageTo3D<T, D><<<grid, block, 0, stream>>>((PtrStepSz<T>)disp, (PtrStepSz<D>)xyz);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void reprojectImageTo3D_gpu<uchar, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
template void reprojectImageTo3D_gpu<uchar, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
template void reprojectImageTo3D_gpu<short, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
template void reprojectImageTo3D_gpu<short, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
/////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
template <typename T>
__device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
{
unsigned int H = ((ndisp-d) * 240)/ndisp;
unsigned int hi = (H/60) % 6;
float f = H/60.f - H/60;
float p = V * (1 - S);
float q = V * (1 - f * S);
float t = V * (1 - (1 - f) * S);
float3 res;
if (hi == 0) //R = V, G = t, B = p
{
res.x = p;
res.y = t;
res.z = V;
}
if (hi == 1) // R = q, G = V, B = p
{
res.x = p;
res.y = V;
res.z = q;
}
if (hi == 2) // R = p, G = V, B = t
{
res.x = t;
res.y = V;
res.z = p;
}
if (hi == 3) // R = p, G = q, B = V
{
res.x = V;
res.y = q;
res.z = p;
}
if (hi == 4) // R = t, G = p, B = V
{
res.x = V;
res.y = p;
res.z = t;
}
if (hi == 5) // R = V, G = p, B = q
{
res.x = q;
res.y = p;
res.z = V;
}
const unsigned int b = (unsigned int)(::max(0.f, ::min(res.x, 1.f)) * 255.f);
const unsigned int g = (unsigned int)(::max(0.f, ::min(res.y, 1.f)) * 255.f);
const unsigned int r = (unsigned int)(::max(0.f, ::min(res.z, 1.f)) * 255.f);
const unsigned int a = 255U;
return (a << 24) + (r << 16) + (g << 8) + b;
}
__global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
{
const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x < width && y < height)
{
uchar4 d4 = *(uchar4*)(disp + y * disp_step + x);
uint4 res;
res.x = cvtPixel(d4.x, ndisp);
res.y = cvtPixel(d4.y, ndisp);
res.z = cvtPixel(d4.z, ndisp);
res.w = cvtPixel(d4.w, ndisp);
uint4* line = (uint4*)(out_image + y * out_step);
line[x >> 2] = res;
}
}
__global__ void drawColorDisp(short* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
{
const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x < width && y < height)
{
short2 d2 = *(short2*)(disp + y * disp_step + x);
uint2 res;
res.x = cvtPixel(d2.x, ndisp);
res.y = cvtPixel(d2.y, ndisp);
uint2* line = (uint2*)(out_image + y * out_step);
line[x >> 1] = res;
}
}
void drawColorDisp_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
{
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x << 2);
grid.y = divUp(src.rows, threads.y);
drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void drawColorDisp_gpu(const PtrStepSz<short>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x << 1);
grid.y = divUp(src.rows, threads.y);
drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}}} // namespace cv { namespace cuda { namespace cudev
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,206 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
Ptr<cuda::DisparityBilateralFilter> cv::cuda::createDisparityBilateralFilter(int, int, int) { throw_no_cuda(); return Ptr<cuda::DisparityBilateralFilter>(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace cuda { namespace device
{
namespace disp_bilateral_filter
{
void disp_load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc);
template<typename T>
void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
}
}}}
namespace
{
class DispBilateralFilterImpl : public cuda::DisparityBilateralFilter
{
public:
DispBilateralFilterImpl(int ndisp, int radius, int iters);
void apply(InputArray disparity, InputArray image, OutputArray dst, Stream& stream);
int getNumDisparities() const { return ndisp_; }
void setNumDisparities(int numDisparities) { ndisp_ = numDisparities; }
int getRadius() const { return radius_; }
void setRadius(int radius);
int getNumIters() const { return iters_; }
void setNumIters(int iters) { iters_ = iters; }
double getEdgeThreshold() const { return edge_threshold_; }
void setEdgeThreshold(double edge_threshold) { edge_threshold_ = (float) edge_threshold; }
double getMaxDiscThreshold() const { return max_disc_threshold_; }
void setMaxDiscThreshold(double max_disc_threshold) { max_disc_threshold_ = (float) max_disc_threshold; }
double getSigmaRange() const { return sigma_range_; }
void setSigmaRange(double sigma_range);
private:
int ndisp_;
int radius_;
int iters_;
float edge_threshold_;
float max_disc_threshold_;
float sigma_range_;
GpuMat table_color_;
GpuMat table_space_;
};
void calc_color_weighted_table(GpuMat& table_color, float sigma_range, int len)
{
Mat cpu_table_color(1, len, CV_32F);
float* line = cpu_table_color.ptr<float>();
for(int i = 0; i < len; i++)
line[i] = static_cast<float>(std::exp(-double(i * i) / (2 * sigma_range * sigma_range)));
table_color.upload(cpu_table_color);
}
void calc_space_weighted_filter(GpuMat& table_space, int win_size, float dist_space)
{
int half = (win_size >> 1);
Mat cpu_table_space(half + 1, half + 1, CV_32F);
for (int y = 0; y <= half; ++y)
{
float* row = cpu_table_space.ptr<float>(y);
for (int x = 0; x <= half; ++x)
row[x] = exp(-sqrt(float(y * y) + float(x * x)) / dist_space);
}
table_space.upload(cpu_table_space);
}
const float DEFAULT_EDGE_THRESHOLD = 0.1f;
const float DEFAULT_MAX_DISC_THRESHOLD = 0.2f;
const float DEFAULT_SIGMA_RANGE = 10.0f;
DispBilateralFilterImpl::DispBilateralFilterImpl(int ndisp, int radius, int iters) :
ndisp_(ndisp), radius_(radius), iters_(iters),
edge_threshold_(DEFAULT_EDGE_THRESHOLD), max_disc_threshold_(DEFAULT_MAX_DISC_THRESHOLD),
sigma_range_(DEFAULT_SIGMA_RANGE)
{
calc_color_weighted_table(table_color_, sigma_range_, 255);
calc_space_weighted_filter(table_space_, radius_ * 2 + 1, radius_ + 1.0f);
}
void DispBilateralFilterImpl::setRadius(int radius)
{
radius_ = radius;
calc_space_weighted_filter(table_space_, radius_ * 2 + 1, radius_ + 1.0f);
}
void DispBilateralFilterImpl::setSigmaRange(double sigma_range)
{
sigma_range_ = (float) sigma_range;
calc_color_weighted_table(table_color_, sigma_range_, 255);
}
template <typename T>
void disp_bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
GpuMat& table_color, GpuMat& table_space,
const GpuMat& disp, const GpuMat& img,
OutputArray _dst, Stream& stream)
{
using namespace cv::cuda::device::disp_bilateral_filter;
const short edge_disc = std::max<short>(short(1), short(ndisp * edge_threshold + 0.5));
const short max_disc = short(ndisp * max_disc_threshold + 0.5);
disp_load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
_dst.create(disp.size(), disp.type());
GpuMat dst = _dst.getGpuMat();
if (dst.data != disp.data)
disp.copyTo(dst, stream);
disp_bilateral_filter<T>(dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
}
void DispBilateralFilterImpl::apply(InputArray _disp, InputArray _image, OutputArray dst, Stream& stream)
{
typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
GpuMat& table_color, GpuMat& table_space,
const GpuMat& disp, const GpuMat& img, OutputArray dst, Stream& stream);
const bilateral_filter_operator_t operators[] =
{disp_bilateral_filter_operator<unsigned char>, 0, 0, disp_bilateral_filter_operator<short>, 0, 0, 0, 0};
CV_Assert( 0 < ndisp_ && 0 < radius_ && 0 < iters_ );
GpuMat disp = _disp.getGpuMat();
GpuMat img = _image.getGpuMat();
CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S );
CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC3 );
CV_Assert( disp.size() == img.size() );
operators[disp.type()](ndisp_, radius_, iters_, edge_threshold_, max_disc_threshold_,
table_color_, table_space_, disp, img, dst, stream);
}
}
Ptr<cuda::DisparityBilateralFilter> cv::cuda::createDisparityBilateralFilter(int ndisp, int radius, int iters)
{
return new DispBilateralFilterImpl(ndisp, radius, iters);
}
#endif /* !defined (HAVE_CUDA) */

View File

@@ -0,0 +1,43 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"

View File

@@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_PRECOMP_H__
#define __OPENCV_PRECOMP_H__
#include <limits>
#include "opencv2/cudastereo.hpp"
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/core/utility.hpp"
#endif /* __OPENCV_PRECOMP_H__ */

View File

@@ -0,0 +1,185 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
Ptr<cuda::StereoBM> cv::cuda::createStereoBM(int, int) { throw_no_cuda(); return Ptr<cuda::StereoBM>(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace cuda { namespace device
{
namespace stereobm
{
void stereoBM_GPU(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int ndisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t & stream);
void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap /*= 31*/, cudaStream_t & stream);
void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream);
}
}}}
namespace
{
class StereoBMImpl : public cuda::StereoBM
{
public:
StereoBMImpl(int numDisparities, int blockSize);
void compute(InputArray left, InputArray right, OutputArray disparity);
void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream);
int getMinDisparity() const { return 0; }
void setMinDisparity(int /*minDisparity*/) {}
int getNumDisparities() const { return ndisp_; }
void setNumDisparities(int numDisparities) { ndisp_ = numDisparities; }
int getBlockSize() const { return winSize_; }
void setBlockSize(int blockSize) { winSize_ = blockSize; }
int getSpeckleWindowSize() const { return 0; }
void setSpeckleWindowSize(int /*speckleWindowSize*/) {}
int getSpeckleRange() const { return 0; }
void setSpeckleRange(int /*speckleRange*/) {}
int getDisp12MaxDiff() const { return 0; }
void setDisp12MaxDiff(int /*disp12MaxDiff*/) {}
int getPreFilterType() const { return preset_; }
void setPreFilterType(int preFilterType) { preset_ = preFilterType; }
int getPreFilterSize() const { return 0; }
void setPreFilterSize(int /*preFilterSize*/) {}
int getPreFilterCap() const { return preFilterCap_; }
void setPreFilterCap(int preFilterCap) { preFilterCap_ = preFilterCap; }
int getTextureThreshold() const { return static_cast<int>(avergeTexThreshold_); }
void setTextureThreshold(int textureThreshold) { avergeTexThreshold_ = static_cast<float>(textureThreshold); }
int getUniquenessRatio() const { return 0; }
void setUniquenessRatio(int /*uniquenessRatio*/) {}
int getSmallerBlockSize() const { return 0; }
void setSmallerBlockSize(int /*blockSize*/){}
Rect getROI1() const { return Rect(); }
void setROI1(Rect /*roi1*/) {}
Rect getROI2() const { return Rect(); }
void setROI2(Rect /*roi2*/) {}
private:
int preset_;
int ndisp_;
int winSize_;
int preFilterCap_;
float avergeTexThreshold_;
GpuMat minSSD_, leBuf_, riBuf_;
};
StereoBMImpl::StereoBMImpl(int numDisparities, int blockSize)
: preset_(0), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3)
{
}
void StereoBMImpl::compute(InputArray left, InputArray right, OutputArray disparity)
{
compute(left, right, disparity, Stream::Null());
}
void StereoBMImpl::compute(InputArray _left, InputArray _right, OutputArray _disparity, Stream& _stream)
{
using namespace ::cv::cuda::device::stereobm;
const int max_supported_ndisp = 1 << (sizeof(unsigned char) * 8);
CV_Assert( 0 < ndisp_ && ndisp_ <= max_supported_ndisp );
CV_Assert( ndisp_ % 8 == 0 );
CV_Assert( winSize_ % 2 == 1 );
GpuMat left = _left.getGpuMat();
GpuMat right = _right.getGpuMat();
CV_Assert( left.type() == CV_8UC1 );
CV_Assert( left.size() == right.size() && left.type() == right.type() );
_disparity.create(left.size(), CV_8UC1);
GpuMat disparity = _disparity.getGpuMat();
cudaStream_t stream = StreamAccessor::getStream(_stream);
cuda::ensureSizeIsEnough(left.size(), CV_32SC1, minSSD_);
PtrStepSzb le_for_bm = left;
PtrStepSzb ri_for_bm = right;
if (preset_ == cv::StereoBM::PREFILTER_XSOBEL)
{
cuda::ensureSizeIsEnough(left.size(), left.type(), leBuf_);
cuda::ensureSizeIsEnough(right.size(), right.type(), riBuf_);
prefilter_xsobel( left, leBuf_, preFilterCap_, stream);
prefilter_xsobel(right, riBuf_, preFilterCap_, stream);
le_for_bm = leBuf_;
ri_for_bm = riBuf_;
}
stereoBM_GPU(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, minSSD_, stream);
if (avergeTexThreshold_ > 0)
postfilter_textureness(le_for_bm, winSize_, avergeTexThreshold_, disparity, stream);
}
}
Ptr<cuda::StereoBM> cv::cuda::createStereoBM(int numDisparities, int blockSize)
{
return new StereoBMImpl(numDisparities, blockSize);
}
#endif /* !defined (HAVE_CUDA) */

View File

@@ -0,0 +1,380 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::cuda::StereoBeliefPropagation::estimateRecommendedParams(int, int, int&, int&, int&) { throw_no_cuda(); }
Ptr<cuda::StereoBeliefPropagation> cv::cuda::createStereoBeliefPropagation(int, int, int, int) { throw_no_cuda(); return Ptr<cuda::StereoBeliefPropagation>(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace cuda { namespace device
{
namespace stereobp
{
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump);
template<typename T, typename D>
void comp_data_gpu(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
template<typename T>
void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
template <typename T>
void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
template <typename T>
void calc_all_iterations_gpu(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d,
const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
template <typename T>
void output_gpu(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data,
const PtrStepSz<short>& disp, cudaStream_t stream);
}
}}}
namespace
{
class StereoBPImpl : public cuda::StereoBeliefPropagation
{
public:
StereoBPImpl(int ndisp, int iters, int levels, int msg_type);
void compute(InputArray left, InputArray right, OutputArray disparity);
void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream);
void compute(InputArray data, OutputArray disparity, Stream& stream);
int getMinDisparity() const { return 0; }
void setMinDisparity(int /*minDisparity*/) {}
int getNumDisparities() const { return ndisp_; }
void setNumDisparities(int numDisparities) { ndisp_ = numDisparities; }
int getBlockSize() const { return 0; }
void setBlockSize(int /*blockSize*/) {}
int getSpeckleWindowSize() const { return 0; }
void setSpeckleWindowSize(int /*speckleWindowSize*/) {}
int getSpeckleRange() const { return 0; }
void setSpeckleRange(int /*speckleRange*/) {}
int getDisp12MaxDiff() const { return 0; }
void setDisp12MaxDiff(int /*disp12MaxDiff*/) {}
int getNumIters() const { return iters_; }
void setNumIters(int iters) { iters_ = iters; }
int getNumLevels() const { return levels_; }
void setNumLevels(int levels) { levels_ = levels; }
double getMaxDataTerm() const { return max_data_term_; }
void setMaxDataTerm(double max_data_term) { max_data_term_ = (float) max_data_term; }
double getDataWeight() const { return data_weight_; }
void setDataWeight(double data_weight) { data_weight_ = (float) data_weight; }
double getMaxDiscTerm() const { return max_disc_term_; }
void setMaxDiscTerm(double max_disc_term) { max_disc_term_ = (float) max_disc_term; }
double getDiscSingleJump() const { return disc_single_jump_; }
void setDiscSingleJump(double disc_single_jump) { disc_single_jump_ = (float) disc_single_jump; }
int getMsgType() const { return msg_type_; }
void setMsgType(int msg_type) { msg_type_ = msg_type; }
private:
void init(Stream& stream);
void calcBP(OutputArray disp, Stream& stream);
int ndisp_;
int iters_;
int levels_;
float max_data_term_;
float data_weight_;
float max_disc_term_;
float disc_single_jump_;
int msg_type_;
float scale_;
int rows_, cols_;
std::vector<int> cols_all_, rows_all_;
GpuMat u_, d_, l_, r_, u2_, d2_, l2_, r2_;
std::vector<GpuMat> datas_;
GpuMat outBuf_;
};
const float DEFAULT_MAX_DATA_TERM = 10.0f;
const float DEFAULT_DATA_WEIGHT = 0.07f;
const float DEFAULT_MAX_DISC_TERM = 1.7f;
const float DEFAULT_DISC_SINGLE_JUMP = 1.0f;
StereoBPImpl::StereoBPImpl(int ndisp, int iters, int levels, int msg_type) :
ndisp_(ndisp), iters_(iters), levels_(levels),
max_data_term_(DEFAULT_MAX_DATA_TERM), data_weight_(DEFAULT_DATA_WEIGHT),
max_disc_term_(DEFAULT_MAX_DISC_TERM), disc_single_jump_(DEFAULT_DISC_SINGLE_JUMP),
msg_type_(msg_type)
{
}
void StereoBPImpl::compute(InputArray left, InputArray right, OutputArray disparity)
{
compute(left, right, disparity, Stream::Null());
}
void StereoBPImpl::compute(InputArray _left, InputArray _right, OutputArray disparity, Stream& stream)
{
using namespace cv::cuda::device::stereobp;
typedef void (*comp_data_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
static const comp_data_t comp_data_callers[2][5] =
{
{0, comp_data_gpu<unsigned char, short>, 0, comp_data_gpu<uchar3, short>, comp_data_gpu<uchar4, short>},
{0, comp_data_gpu<unsigned char, float>, 0, comp_data_gpu<uchar3, float>, comp_data_gpu<uchar4, float>}
};
scale_ = msg_type_ == CV_32F ? 1.0f : 10.0f;
CV_Assert( 0 < ndisp_ && 0 < iters_ && 0 < levels_ );
CV_Assert( msg_type_ == CV_32F || msg_type_ == CV_16S );
CV_Assert( msg_type_ == CV_32F || (1 << (levels_ - 1)) * scale_ * max_data_term_ < std::numeric_limits<short>::max() );
GpuMat left = _left.getGpuMat();
GpuMat right = _right.getGpuMat();
CV_Assert( left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4 );
CV_Assert( left.size() == right.size() && left.type() == right.type() );
rows_ = left.rows;
cols_ = left.cols;
const int divisor = (int) pow(2.f, levels_ - 1.0f);
const int lowest_cols = cols_ / divisor;
const int lowest_rows = rows_ / divisor;
const int min_image_dim_size = 2;
CV_Assert( std::min(lowest_cols, lowest_rows) > min_image_dim_size );
init(stream);
datas_[0].create(rows_ * ndisp_, cols_, msg_type_);
comp_data_callers[msg_type_ == CV_32F][left.channels()](left, right, datas_[0], StreamAccessor::getStream(stream));
calcBP(disparity, stream);
}
void StereoBPImpl::compute(InputArray _data, OutputArray disparity, Stream& stream)
{
scale_ = msg_type_ == CV_32F ? 1.0f : 10.0f;
CV_Assert( 0 < ndisp_ && 0 < iters_ && 0 < levels_ );
CV_Assert( msg_type_ == CV_32F || msg_type_ == CV_16S );
CV_Assert( msg_type_ == CV_32F || (1 << (levels_ - 1)) * scale_ * max_data_term_ < std::numeric_limits<short>::max() );
GpuMat data = _data.getGpuMat();
CV_Assert( (data.type() == msg_type_) && (data.rows % ndisp_ == 0) );
rows_ = data.rows / ndisp_;
cols_ = data.cols;
const int divisor = (int) pow(2.f, levels_ - 1.0f);
const int lowest_cols = cols_ / divisor;
const int lowest_rows = rows_ / divisor;
const int min_image_dim_size = 2;
CV_Assert( std::min(lowest_cols, lowest_rows) > min_image_dim_size );
init(stream);
data.copyTo(datas_[0], stream);
calcBP(disparity, stream);
}
void StereoBPImpl::init(Stream& stream)
{
using namespace cv::cuda::device::stereobp;
u_.create(rows_ * ndisp_, cols_, msg_type_);
d_.create(rows_ * ndisp_, cols_, msg_type_);
l_.create(rows_ * ndisp_, cols_, msg_type_);
r_.create(rows_ * ndisp_, cols_, msg_type_);
if (levels_ & 1)
{
//can clear less area
u_.setTo(0, stream);
d_.setTo(0, stream);
l_.setTo(0, stream);
r_.setTo(0, stream);
}
if (levels_ > 1)
{
int less_rows = (rows_ + 1) / 2;
int less_cols = (cols_ + 1) / 2;
u2_.create(less_rows * ndisp_, less_cols, msg_type_);
d2_.create(less_rows * ndisp_, less_cols, msg_type_);
l2_.create(less_rows * ndisp_, less_cols, msg_type_);
r2_.create(less_rows * ndisp_, less_cols, msg_type_);
if ((levels_ & 1) == 0)
{
u2_.setTo(0, stream);
d2_.setTo(0, stream);
l2_.setTo(0, stream);
r2_.setTo(0, stream);
}
}
load_constants(ndisp_, max_data_term_, scale_ * data_weight_, scale_ * max_disc_term_, scale_ * disc_single_jump_);
datas_.resize(levels_);
cols_all_.resize(levels_);
rows_all_.resize(levels_);
cols_all_[0] = cols_;
rows_all_[0] = rows_;
}
void StereoBPImpl::calcBP(OutputArray disp, Stream& _stream)
{
using namespace cv::cuda::device::stereobp;
typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
static const data_step_down_t data_step_down_callers[2] =
{
data_step_down_gpu<short>, data_step_down_gpu<float>
};
typedef void (*level_up_messages_t)(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
static const level_up_messages_t level_up_messages_callers[2] =
{
level_up_messages_gpu<short>, level_up_messages_gpu<float>
};
typedef void (*calc_all_iterations_t)(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
static const calc_all_iterations_t calc_all_iterations_callers[2] =
{
calc_all_iterations_gpu<short>, calc_all_iterations_gpu<float>
};
typedef void (*output_t)(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
static const output_t output_callers[2] =
{
output_gpu<short>, output_gpu<float>
};
const int funcIdx = msg_type_ == CV_32F;
cudaStream_t stream = StreamAccessor::getStream(_stream);
for (int i = 1; i < levels_; ++i)
{
cols_all_[i] = (cols_all_[i-1] + 1) / 2;
rows_all_[i] = (rows_all_[i-1] + 1) / 2;
datas_[i].create(rows_all_[i] * ndisp_, cols_all_[i], msg_type_);
data_step_down_callers[funcIdx](cols_all_[i], rows_all_[i], rows_all_[i-1], datas_[i-1], datas_[i], stream);
}
PtrStepSzb mus[] = {u_, u2_};
PtrStepSzb mds[] = {d_, d2_};
PtrStepSzb mrs[] = {r_, r2_};
PtrStepSzb mls[] = {l_, l2_};
int mem_idx = (levels_ & 1) ? 0 : 1;
for (int i = levels_ - 1; i >= 0; --i)
{
// for lower level we have already computed messages by setting to zero
if (i != levels_ - 1)
level_up_messages_callers[funcIdx](mem_idx, cols_all_[i], rows_all_[i], rows_all_[i+1], mus, mds, mls, mrs, stream);
calc_all_iterations_callers[funcIdx](cols_all_[i], rows_all_[i], iters_, mus[mem_idx], mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas_[i], stream);
mem_idx = (mem_idx + 1) & 1;
}
const int dtype = disp.fixedType() ? disp.type() : CV_16SC1;
disp.create(rows_, cols_, dtype);
GpuMat out = disp.getGpuMat();
if (dtype != CV_16SC1)
{
outBuf_.create(rows_, cols_, CV_16SC1);
out = outBuf_;
}
out.setTo(0, _stream);
output_callers[funcIdx](u_, d_, l_, r_, datas_.front(), out, stream);
if (dtype != CV_16SC1)
out.convertTo(disp, dtype, _stream);
}
}
Ptr<cuda::StereoBeliefPropagation> cv::cuda::createStereoBeliefPropagation(int ndisp, int iters, int levels, int msg_type)
{
return new StereoBPImpl(ndisp, iters, levels, msg_type);
}
void cv::cuda::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)
{
ndisp = width / 4;
if ((ndisp & 1) != 0)
ndisp++;
int mm = std::max(width, height);
iters = mm / 100 + 2;
levels = (int)(::log(static_cast<double>(mm)) + 1) * 4 / 5;
if (levels == 0) levels++;
}
#endif /* !defined (HAVE_CUDA) */

View File

@@ -0,0 +1,387 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::cuda::StereoConstantSpaceBP::estimateRecommendedParams(int, int, int&, int&, int&, int&) { throw_no_cuda(); }
Ptr<cuda::StereoConstantSpaceBP> cv::cuda::createStereoConstantSpaceBP(int, int, int, int, int) { throw_no_cuda(); return Ptr<cuda::StereoConstantSpaceBP>(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace cuda { namespace device
{
namespace stereocsbp
{
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& temp);
template<class T>
void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
template<class T>
void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
template<class T>
void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
T* data_cost_selected, const T* data_cost, size_t msg_step,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
template<class T>
void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream);
template<class T>
void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
}
}}}
namespace
{
class StereoCSBPImpl : public cuda::StereoConstantSpaceBP
{
public:
StereoCSBPImpl(int ndisp, int iters, int levels, int nr_plane, int msg_type);
void compute(InputArray left, InputArray right, OutputArray disparity);
void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream);
void compute(InputArray data, OutputArray disparity, Stream& stream);
int getMinDisparity() const { return min_disp_th_; }
void setMinDisparity(int minDisparity) { min_disp_th_ = minDisparity; }
int getNumDisparities() const { return ndisp_; }
void setNumDisparities(int numDisparities) { ndisp_ = numDisparities; }
int getBlockSize() const { return 0; }
void setBlockSize(int /*blockSize*/) {}
int getSpeckleWindowSize() const { return 0; }
void setSpeckleWindowSize(int /*speckleWindowSize*/) {}
int getSpeckleRange() const { return 0; }
void setSpeckleRange(int /*speckleRange*/) {}
int getDisp12MaxDiff() const { return 0; }
void setDisp12MaxDiff(int /*disp12MaxDiff*/) {}
int getNumIters() const { return iters_; }
void setNumIters(int iters) { iters_ = iters; }
int getNumLevels() const { return levels_; }
void setNumLevels(int levels) { levels_ = levels; }
double getMaxDataTerm() const { return max_data_term_; }
void setMaxDataTerm(double max_data_term) { max_data_term_ = (float) max_data_term; }
double getDataWeight() const { return data_weight_; }
void setDataWeight(double data_weight) { data_weight_ = (float) data_weight; }
double getMaxDiscTerm() const { return max_disc_term_; }
void setMaxDiscTerm(double max_disc_term) { max_disc_term_ = (float) max_disc_term; }
double getDiscSingleJump() const { return disc_single_jump_; }
void setDiscSingleJump(double disc_single_jump) { disc_single_jump_ = (float) disc_single_jump; }
int getMsgType() const { return msg_type_; }
void setMsgType(int msg_type) { msg_type_ = msg_type; }
int getNrPlane() const { return nr_plane_; }
void setNrPlane(int nr_plane) { nr_plane_ = nr_plane; }
bool getUseLocalInitDataCost() const { return use_local_init_data_cost_; }
void setUseLocalInitDataCost(bool use_local_init_data_cost) { use_local_init_data_cost_ = use_local_init_data_cost; }
private:
int min_disp_th_;
int ndisp_;
int iters_;
int levels_;
float max_data_term_;
float data_weight_;
float max_disc_term_;
float disc_single_jump_;
int msg_type_;
int nr_plane_;
bool use_local_init_data_cost_;
GpuMat mbuf_;
GpuMat temp_;
GpuMat outBuf_;
};
const float DEFAULT_MAX_DATA_TERM = 30.0f;
const float DEFAULT_DATA_WEIGHT = 1.0f;
const float DEFAULT_MAX_DISC_TERM = 160.0f;
const float DEFAULT_DISC_SINGLE_JUMP = 10.0f;
StereoCSBPImpl::StereoCSBPImpl(int ndisp, int iters, int levels, int nr_plane, int msg_type) :
min_disp_th_(0), ndisp_(ndisp), iters_(iters), levels_(levels),
max_data_term_(DEFAULT_MAX_DATA_TERM), data_weight_(DEFAULT_DATA_WEIGHT),
max_disc_term_(DEFAULT_MAX_DISC_TERM), disc_single_jump_(DEFAULT_DISC_SINGLE_JUMP),
msg_type_(msg_type), nr_plane_(nr_plane), use_local_init_data_cost_(true)
{
}
void StereoCSBPImpl::compute(InputArray left, InputArray right, OutputArray disparity)
{
compute(left, right, disparity, Stream::Null());
}
void StereoCSBPImpl::compute(InputArray _left, InputArray _right, OutputArray disp, Stream& _stream)
{
using namespace cv::cuda::device::stereocsbp;
CV_Assert( msg_type_ == CV_32F || msg_type_ == CV_16S );
CV_Assert( 0 < ndisp_ && 0 < iters_ && 0 < levels_ && 0 < nr_plane_ && levels_ <= 8 );
GpuMat left = _left.getGpuMat();
GpuMat right = _right.getGpuMat();
CV_Assert( left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4 );
CV_Assert( left.size() == right.size() && left.type() == right.type() );
cudaStream_t stream = StreamAccessor::getStream(_stream);
////////////////////////////////////////////////////////////////////////////////////////////
// Init
int rows = left.rows;
int cols = left.cols;
levels_ = std::min(levels_, int(log((double)ndisp_) / log(2.0)));
// compute sizes
AutoBuffer<int> buf(levels_ * 3);
int* cols_pyr = buf;
int* rows_pyr = cols_pyr + levels_;
int* nr_plane_pyr = rows_pyr + levels_;
cols_pyr[0] = cols;
rows_pyr[0] = rows;
nr_plane_pyr[0] = nr_plane_;
for (int i = 1; i < levels_; i++)
{
cols_pyr[i] = cols_pyr[i-1] / 2;
rows_pyr[i] = rows_pyr[i-1] / 2;
nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2;
}
GpuMat u[2], d[2], l[2], r[2], disp_selected_pyr[2], data_cost, data_cost_selected;
//allocate buffers
int buffers_count = 10; // (up + down + left + right + disp_selected_pyr) * 2
buffers_count += 2; // data_cost has twice more rows than other buffers, what's why +2, not +1;
buffers_count += 1; // data_cost_selected
mbuf_.create(rows * nr_plane_ * buffers_count, cols, msg_type_);
data_cost = mbuf_.rowRange(0, rows * nr_plane_ * 2);
data_cost_selected = mbuf_.rowRange(data_cost.rows, data_cost.rows + rows * nr_plane_);
for(int k = 0; k < 2; ++k) // in/out
{
GpuMat sub1 = mbuf_.rowRange(data_cost.rows + data_cost_selected.rows, mbuf_.rows);
GpuMat sub2 = sub1.rowRange((k+0)*sub1.rows/2, (k+1)*sub1.rows/2);
GpuMat *buf_ptrs[] = { &u[k], &d[k], &l[k], &r[k], &disp_selected_pyr[k] };
for(int _r = 0; _r < 5; ++_r)
{
*buf_ptrs[_r] = sub2.rowRange(_r * sub2.rows/5, (_r+1) * sub2.rows/5);
CV_DbgAssert( buf_ptrs[_r]->cols == cols && buf_ptrs[_r]->rows == rows * nr_plane_ );
}
};
size_t elem_step = mbuf_.step / mbuf_.elemSize();
Size temp_size = data_cost.size();
if ((size_t)temp_size.area() < elem_step * rows_pyr[levels_ - 1] * ndisp_)
temp_size = Size(static_cast<int>(elem_step), rows_pyr[levels_ - 1] * ndisp_);
temp_.create(temp_size, msg_type_);
////////////////////////////////////////////////////////////////////////////
// Compute
load_constants(ndisp_, max_data_term_, data_weight_, max_disc_term_, disc_single_jump_, min_disp_th_, left, right, temp_);
l[0].setTo(0, _stream);
d[0].setTo(0, _stream);
r[0].setTo(0, _stream);
u[0].setTo(0, _stream);
l[1].setTo(0, _stream);
d[1].setTo(0, _stream);
r[1].setTo(0, _stream);
u[1].setTo(0, _stream);
data_cost.setTo(0, _stream);
data_cost_selected.setTo(0, _stream);
int cur_idx = 0;
if (msg_type_ == CV_32F)
{
for (int i = levels_ - 1; i >= 0; i--)
{
if (i == levels_ - 1)
{
init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<float>(), data_cost_selected.ptr<float>(),
elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], ndisp_, left.channels(), use_local_init_data_cost_, stream);
}
else
{
compute_data_cost(disp_selected_pyr[cur_idx].ptr<float>(), data_cost.ptr<float>(), elem_step,
left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), stream);
int new_idx = (cur_idx + 1) & 1;
init_message(u[new_idx].ptr<float>(), d[new_idx].ptr<float>(), l[new_idx].ptr<float>(), r[new_idx].ptr<float>(),
u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
disp_selected_pyr[new_idx].ptr<float>(), disp_selected_pyr[cur_idx].ptr<float>(),
data_cost_selected.ptr<float>(), data_cost.ptr<float>(), elem_step, rows_pyr[i],
cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], stream);
cur_idx = new_idx;
}
calc_all_iterations(u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
data_cost_selected.ptr<float>(), disp_selected_pyr[cur_idx].ptr<float>(), elem_step,
rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], iters_, stream);
}
}
else
{
for (int i = levels_ - 1; i >= 0; i--)
{
if (i == levels_ - 1)
{
init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<short>(), data_cost_selected.ptr<short>(),
elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], ndisp_, left.channels(), use_local_init_data_cost_, stream);
}
else
{
compute_data_cost(disp_selected_pyr[cur_idx].ptr<short>(), data_cost.ptr<short>(), elem_step,
left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), stream);
int new_idx = (cur_idx + 1) & 1;
init_message(u[new_idx].ptr<short>(), d[new_idx].ptr<short>(), l[new_idx].ptr<short>(), r[new_idx].ptr<short>(),
u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
disp_selected_pyr[new_idx].ptr<short>(), disp_selected_pyr[cur_idx].ptr<short>(),
data_cost_selected.ptr<short>(), data_cost.ptr<short>(), elem_step, rows_pyr[i],
cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], stream);
cur_idx = new_idx;
}
calc_all_iterations(u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
data_cost_selected.ptr<short>(), disp_selected_pyr[cur_idx].ptr<short>(), elem_step,
rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], iters_, stream);
}
}
const int dtype = disp.fixedType() ? disp.type() : CV_16SC1;
disp.create(rows, cols, dtype);
GpuMat out = disp.getGpuMat();
if (dtype != CV_16SC1)
{
outBuf_.create(rows, cols, CV_16SC1);
out = outBuf_;
}
out.setTo(0, _stream);
if (msg_type_ == CV_32F)
{
compute_disp(u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
data_cost_selected.ptr<float>(), disp_selected_pyr[cur_idx].ptr<float>(), elem_step, out, nr_plane_pyr[0], stream);
}
else
{
compute_disp(u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
data_cost_selected.ptr<short>(), disp_selected_pyr[cur_idx].ptr<short>(), elem_step, out, nr_plane_pyr[0], stream);
}
if (dtype != CV_16SC1)
out.convertTo(disp, dtype, _stream);
}
void StereoCSBPImpl::compute(InputArray /*data*/, OutputArray /*disparity*/, Stream& /*stream*/)
{
CV_Error(Error::StsNotImplemented, "Not implemented");
}
}
Ptr<cuda::StereoConstantSpaceBP> cv::cuda::createStereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, int msg_type)
{
return new StereoCSBPImpl(ndisp, iters, levels, nr_plane, msg_type);
}
void cv::cuda::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane)
{
ndisp = (int) ((float) width / 3.14f);
if ((ndisp & 1) != 0)
ndisp++;
int mm = std::max(width, height);
iters = mm / 100 + ((mm > 1200)? - 4 : 4);
levels = (int)::log(static_cast<double>(mm)) * 2 / 3;
if (levels == 0) levels++;
nr_plane = (int) ((float) ndisp / std::pow(2.0, levels + 1));
}
#endif /* !defined (HAVE_CUDA) */

View File

@@ -0,0 +1,123 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::cuda::reprojectImageTo3D(InputArray, OutputArray, InputArray, int, Stream&) { throw_no_cuda(); }
void cv::cuda::drawColorDisp(InputArray, OutputArray, int, Stream&) { throw_no_cuda(); }
#else
////////////////////////////////////////////////////////////////////////
// reprojectImageTo3D
namespace cv { namespace cuda { namespace device
{
template <typename T, typename D>
void reprojectImageTo3D_gpu(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
}}}
void cv::cuda::reprojectImageTo3D(InputArray _disp, OutputArray _xyz, InputArray _Q, int dst_cn, Stream& stream)
{
using namespace cv::cuda::device;
typedef void (*func_t)(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
static const func_t funcs[2][4] =
{
{reprojectImageTo3D_gpu<uchar, float3>, 0, 0, reprojectImageTo3D_gpu<short, float3>},
{reprojectImageTo3D_gpu<uchar, float4>, 0, 0, reprojectImageTo3D_gpu<short, float4>}
};
GpuMat disp = _disp.getGpuMat();
Mat Q = _Q.getMat();
CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S );
CV_Assert( Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4 && Q.isContinuous() );
CV_Assert( dst_cn == 3 || dst_cn == 4 );
_xyz.create(disp.size(), CV_MAKE_TYPE(CV_32F, dst_cn));
GpuMat xyz = _xyz.getGpuMat();
funcs[dst_cn == 4][disp.type()](disp, xyz, Q.ptr<float>(), StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
// drawColorDisp
namespace cv { namespace cuda { namespace device
{
void drawColorDisp_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream);
void drawColorDisp_gpu(const PtrStepSz<short>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream);
}}}
namespace
{
template <typename T>
void drawColorDisp_caller(const GpuMat& src, OutputArray _dst, int ndisp, const cudaStream_t& stream)
{
using namespace ::cv::cuda::device;
_dst.create(src.size(), CV_8UC4);
GpuMat dst = _dst.getGpuMat();
drawColorDisp_gpu((PtrStepSz<T>)src, dst, ndisp, stream);
}
}
void cv::cuda::drawColorDisp(InputArray _src, OutputArray dst, int ndisp, Stream& stream)
{
typedef void (*drawColorDisp_caller_t)(const GpuMat& src, OutputArray dst, int ndisp, const cudaStream_t& stream);
const drawColorDisp_caller_t drawColorDisp_callers[] = {drawColorDisp_caller<unsigned char>, 0, 0, drawColorDisp_caller<short>, 0, 0, 0, 0};
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8U || src.type() == CV_16S );
drawColorDisp_callers[src.type()](src, dst, ndisp, StreamAccessor::getStream(stream));
}
#endif