From 17f7b12a83b6c35e62b88827311a371bd36c6aa4 Mon Sep 17 00:00:00 2001
From: Anatoly Baksheev
Date: Thu, 22 Jul 2010 15:32:03 +0000
Subject: [PATCH] Variable winSize for StereoBM_GPU
 Fixed StereoBM_GPU kernel crash
 Textureness threshold added

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |   8 +-
 modules/gpu/src/cuda/cuda_shared.hpp    |   2 +-
 modules/gpu/src/cuda/stereobm.cu        | 309 ++++++++++++++++++------
 modules/gpu/src/stereobm_gpu.cpp        |  31 ++-
 4 files changed, 268 insertions(+), 82 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index d36404355..ca6c9ad8d 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -335,6 +335,7 @@ namespace cv
             //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size
             //! ndisparities should be multiple of 8. SSD WindowsSize is fixed to 19 now
             StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ);
+
             //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair
             //! Output disparity has CV_8U type.
             void operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity);
@@ -350,9 +351,14 @@ namespace cv
             int ndisp;
             int winSize;
             int preset;
+
+            // If avergeTexThreshold == 0 => post-processing is disabled.
+            // If avergeTexThreshold != 0 then disparity is set to 0 in each point (x, y) where, for the left image,
+            // SumOfHorizontalGradientsInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold,
+            // i.e. the input left image is low-textured.
+            float avergeTexThreshold;
         private:
             GpuMat minSSD, leBuf, riBuf;
-
         };
     }
 }

diff --git a/modules/gpu/src/cuda/cuda_shared.hpp b/modules/gpu/src/cuda/cuda_shared.hpp
index 217416648..0b6a63b3d 100644
--- a/modules/gpu/src/cuda/cuda_shared.hpp
+++ b/modules/gpu/src/cuda/cuda_shared.hpp
@@ -80,7 +80,7 @@ namespace cv
     static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
     {
         if( cudaSuccess != err)
-            cv::gpu::error(cudaGetErrorString(err), __FILE__, __LINE__, func);
+            cv::gpu::error(cudaGetErrorString(err), file, line, func);
     }

 #endif /* __OPENCV_CUDA_SHARED_HPP__ */
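A note on the cuda_shared.hpp hunk: the bug was that ___cudaSafeCall ignored the file/line arguments it was handed and reported every CUDA failure as originating in cuda_shared.hpp itself. The sketch below shows the assumed shape of the cudaSafeCall macro that feeds it (the exact definition lives elsewhere in this header and is not part of the patch); the point is that __FILE__ and __LINE__ are captured at the call site and must be forwarded unchanged.

```cpp
// Assumed shape of the calling macro (a sketch, not the verbatim header):
// __FILE__/__LINE__ expand at the call site, so the helper must forward the
// file/line parameters instead of re-expanding the macros inside itself.
#if defined(__GNUC__)
    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#else
    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
#endif
```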
diff --git a/modules/gpu/src/cuda/stereobm.cu b/modules/gpu/src/cuda/stereobm.cu
index e2427da36..d53d81a3c 100644
--- a/modules/gpu/src/cuda/stereobm.cu
+++ b/modules/gpu/src/cuda/stereobm.cu
@@ -42,21 +42,22 @@

 #include "cuda_shared.hpp"

+using namespace cv::gpu;
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////// Stereo BM ////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
 #define ROWSperTHREAD 21     // the number of rows a thread will process
+
+namespace stereobm_gpu
+{
+
 #define BLOCK_W 128          // the thread block width (464)
 #define N_DISPARITIES 8

 #define STEREO_MIND 0                    // The minimum d range to check
 #define STEREO_DISP_STEP N_DISPARITIES   // the d step, must be <= 1 to avoid aliasing
-#define RADIUS 9                         // Kernel Radius 5V & 5H = 11x11 kernel
-
-#define WINSZ (2 * RADIUS + 1)
-#define N_DIRTY_PIXELS (2 * RADIUS)
-#define COL_SSD_SIZE (BLOCK_W + N_DIRTY_PIXELS)
-#define SHARED_MEM_SIZE (COL_SSD_SIZE)   // amount of shared memory used

-namespace stereobm_gpu
-{
 __constant__ unsigned int* cminSSDImage;
 __constant__ size_t cminSSD_step;
@@ -68,6 +69,7 @@ __device__ int SQ(int a)
     return a * a;
 }

+template<int RADIUS>
 __device__ unsigned int CalcSSD(unsigned int *col_ssd_cache, unsigned int *col_ssd)
 {
     unsigned int cache = 0;
@@ -83,24 +85,26 @@ __device__ unsigned int CalcSSD(unsigned int *col_ssd_cache, unsigned int *col_s
     if (threadIdx.x < BLOCK_W - RADIUS)
         cache2 = col_ssd_cache[RADIUS];
     else
-        for(int i = RADIUS + 1; i < WINSZ; i++)
+        for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)
             cache2 += col_ssd[i];

     return col_ssd[0] + cache + cache2;
 }

+template<int RADIUS>
 __device__ uint2 MinSSD(unsigned int *col_ssd_cache, unsigned int *col_ssd)
 {
     unsigned int ssd[N_DISPARITIES];

-    ssd[0] = CalcSSD(col_ssd_cache, col_ssd + 0 * SHARED_MEM_SIZE);
-    ssd[1] = CalcSSD(col_ssd_cache, col_ssd + 1 * SHARED_MEM_SIZE);
-    ssd[2] = CalcSSD(col_ssd_cache, col_ssd + 2 * SHARED_MEM_SIZE);
-    ssd[3] = CalcSSD(col_ssd_cache, col_ssd + 3 * SHARED_MEM_SIZE);
-    ssd[4] = CalcSSD(col_ssd_cache, col_ssd + 4 * SHARED_MEM_SIZE);
-    ssd[5] = CalcSSD(col_ssd_cache, col_ssd + 5 * SHARED_MEM_SIZE);
-    ssd[6] = CalcSSD(col_ssd_cache, col_ssd + 6 * SHARED_MEM_SIZE);
-    ssd[7] = CalcSSD(col_ssd_cache, col_ssd + 7 * SHARED_MEM_SIZE);
+    //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
+    ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
+    ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
+    ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
+    ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
+    ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
+    ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
+    ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
+    ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));

     int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7])));
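For orientation before the StepDown hunk that follows: col_ssd holds one vertical sum of squared differences per image column and candidate disparity, and StepDown slides that window down one row incrementally instead of recomputing it. A scalar CPU sketch of the update (the helper name is made up for illustration):

```cpp
// Scalar sketch of the incremental column-SSD update performed by StepDown:
// when the window slides down one row, each column sum loses the squared
// difference of the row leaving the window and gains the one entering it.
// idx1 indexes the departing row, idx2 the entering row.
inline void step_down_ref(const unsigned char* imageL, const unsigned char* imageR,
                          int idx1, int idx2, unsigned int& col_ssd)
{
    int diff_out = imageL[idx1] - imageR[idx1];  // row that leaves the window
    int diff_in  = imageL[idx2] - imageR[idx2];  // row that enters the window
    col_ssd += diff_in * diff_in - diff_out * diff_out;
}
```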
@@ -114,6 +118,7 @@ __device__ uint2 MinSSD(unsigned int *col_ssd_cache, unsigned int *col_ssd)
     return make_uint2(mssd, bestIdx);
 }

+template<int RADIUS>
 __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, unsigned int *col_ssd)
 {
     unsigned char leftPixel1;
@@ -146,47 +151,48 @@ __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned cha
     rightPixel2[5] = imageR[idx2 - 5];
     rightPixel2[6] = imageR[idx2 - 6];

-
+    //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
     diff1 = leftPixel1 - rightPixel1[0];
     diff2 = leftPixel2 - rightPixel2[0];
-    col_ssd[0 * SHARED_MEM_SIZE] += SQ(diff2) - SQ(diff1);
+    col_ssd[0 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);

     diff1 = leftPixel1 - rightPixel1[1];
     diff2 = leftPixel2 - rightPixel2[1];
-    col_ssd[1 * SHARED_MEM_SIZE] += SQ(diff2) - SQ(diff1);
+    col_ssd[1 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);

     diff1 = leftPixel1 - rightPixel1[2];
     diff2 = leftPixel2 - rightPixel2[2];
-    col_ssd[2 * SHARED_MEM_SIZE] += SQ(diff2) - SQ(diff1);
+    col_ssd[2 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);

     diff1 = leftPixel1 - rightPixel1[3];
     diff2 = leftPixel2 - rightPixel2[3];
-    col_ssd[3 * SHARED_MEM_SIZE] += SQ(diff2) - SQ(diff1);
+    col_ssd[3 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);

     diff1 = leftPixel1 - rightPixel1[4];
     diff2 = leftPixel2 - rightPixel2[4];
-    col_ssd[4 * SHARED_MEM_SIZE] += SQ(diff2) - SQ(diff1);
+    col_ssd[4 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);

     diff1 = leftPixel1 - rightPixel1[5];
     diff2 = leftPixel2 - rightPixel2[5];
-    col_ssd[5 * SHARED_MEM_SIZE] += SQ(diff2) - SQ(diff1);
+    col_ssd[5 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);

     diff1 = leftPixel1 - rightPixel1[6];
     diff2 = leftPixel2 - rightPixel2[6];
-    col_ssd[6 * SHARED_MEM_SIZE] += SQ(diff2) - SQ(diff1);
+    col_ssd[6 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);

     diff1 = leftPixel1 - rightPixel1[7];
     diff2 = leftPixel2 - rightPixel2[7];
-    col_ssd[7 * SHARED_MEM_SIZE] += SQ(diff2) - SQ(diff1);
+    col_ssd[7 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
 }

+template<int RADIUS>
 __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* imageL, unsigned char* imageR, int d, unsigned int *col_ssd)
 {
     unsigned char leftPixel1;
     int idx;
     unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};

-    for(int i = 0; i < WINSZ; i++)
+    for(int i = 0; i < (2 * RADIUS + 1); i++)
     {
         idx = y_tex * im_pitch + x_tex;
         leftPixel1 = imageL[idx];
@@ -202,23 +208,24 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im
         diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);

         y_tex += 1;
-    }
-
-    col_ssd[0 * SHARED_MEM_SIZE] = diffa[0];
-    col_ssd[1 * SHARED_MEM_SIZE] = diffa[1];
-    col_ssd[2 * SHARED_MEM_SIZE] = diffa[2];
-    col_ssd[3 * SHARED_MEM_SIZE] = diffa[3];
-    col_ssd[4 * SHARED_MEM_SIZE] = diffa[4];
-    col_ssd[5 * SHARED_MEM_SIZE] = diffa[5];
-    col_ssd[6 * SHARED_MEM_SIZE] = diffa[6];
-    col_ssd[7 * SHARED_MEM_SIZE] = diffa[7];
+    }
+    //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
+    col_ssd[0 * (BLOCK_W + 2 * RADIUS)] = diffa[0];
+    col_ssd[1 * (BLOCK_W + 2 * RADIUS)] = diffa[1];
+    col_ssd[2 * (BLOCK_W + 2 * RADIUS)] = diffa[2];
+    col_ssd[3 * (BLOCK_W + 2 * RADIUS)] = diffa[3];
+    col_ssd[4 * (BLOCK_W + 2 * RADIUS)] = diffa[4];
+    col_ssd[5 * (BLOCK_W + 2 * RADIUS)] = diffa[5];
+    col_ssd[6 * (BLOCK_W + 2 * RADIUS)] = diffa[6];
+    col_ssd[7 * (BLOCK_W + 2 * RADIUS)] = diffa[7];
 }

-extern "C" __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, unsigned char* disp, size_t disp_pitch, int maxdisp)
+template<int RADIUS>
+__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, unsigned char* disp, size_t disp_pitch, int maxdisp)
 {
     extern __shared__ unsigned int col_ssd_cache[];
     unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
-    unsigned int *col_ssd_extra = threadIdx.x < N_DIRTY_PIXELS ? col_ssd + BLOCK_W : 0;
+    unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0;  //#define N_DIRTY_PIXELS (2 * RADIUS)

     //#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
     int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp);
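Since RADIUS is now a template parameter, the former COL_SSD_SIZE/SHARED_MEM_SIZE macros are spelled out inline as BLOCK_W + 2 * RADIUS throughout. A small constexpr sketch of the resulting per-block shared-memory budget (constants copied from the defines above; the function name is made up, and the assert assumes a 4-byte unsigned int):

```cpp
#include <cstddef>

// Per-block shared memory used by stereoKernel: one cache word per thread,
// plus N_DISPARITIES column-SSD rows of (BLOCK_W + 2 * RADIUS) words each.
constexpr std::size_t smem_bytes(int radius, int block_w = 128, int n_disp = 8)
{
    return (block_w + n_disp * (block_w + 2 * radius)) * sizeof(unsigned int);
}

// The previously fixed RADIUS of 9 (a 19x19 window) needs about 5 KB:
static_assert(smem_bytes(9) == 5184, "128 + 8 * 146 words of 4 bytes");
```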
@@ -237,20 +244,25 @@
     int end_row = min(ROWSperTHREAD, cheight - Y);
     int y_tex;
     int x_tex = X - RADIUS;
+
+    if (x_tex >= cwidth)
+        return;
+
     for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
     {
         y_tex = Y - RADIUS;

-        InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd);
+        InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);

         if (col_ssd_extra > 0)
-            InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
+            if (x_tex + BLOCK_W < cwidth)
+                InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);

         __syncthreads(); //before MinSSD function

         if (X < cwidth - RADIUS && Y < cheight - RADIUS)
         {
-            uint2 minSSD = MinSSD(col_ssd_cache + threadIdx.x, col_ssd);
+            uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
             if (minSSD.x < minSSDImage[0])
             {
                 disparImage[0] = (unsigned char)(d + minSSD.y);
@@ -261,14 +273,15 @@
         for(int row = 1; row < end_row; row++)
         {
             int idx1 = y_tex * img_step + x_tex;
-            int idx2 = (y_tex + WINSZ) * img_step + x_tex;
+            int idx2 = (y_tex + (2 * RADIUS + 1)) * img_step + x_tex;

             __syncthreads();

-            StepDown(idx1, idx2, left, right, d, col_ssd);
+            StepDown<RADIUS>(idx1, idx2, left, right, d, col_ssd);

             if (col_ssd_extra)
-                StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
+                if (x_tex + BLOCK_W < cwidth)
+                    StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);

             y_tex += 1;
@@ -277,7 +290,7 @@
             if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
             {
                 int idx = row * cminSSD_step;
-                uint2 minSSD = MinSSD(col_ssd_cache + threadIdx.x, col_ssd);
+                uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
                 if (minSSD.x < minSSDImage[idx])
                 {
                     disparImage[disp_pitch * row] = (unsigned char)(d + minSSD.y);
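The x_tex bounds checks added above are the kernel-crash fix named in the commit message. grid.x is computed with divUp (rounded up), so the rightmost block can overhang the image border; the sketch below assumes divUp has the usual ceiling-division meaning it has in this module:

```cpp
// Why the x_tex guards are needed (a sketch). divUp rounds the block count
// up, so the last block of 128 threads can reach past the right image edge.
inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

// Example: a 640-wide pair with maxdisp = 64 and RADIUS = 9 gives
//   grid.x = divUp(640 - 64 - 18, 128) = divUp(558, 128) = 5 blocks,
// i.e. 5 * 128 = 640 threads for 558 useful columns. Without the
// "if (x_tex >= cwidth) return;" and "x_tex + BLOCK_W < cwidth" checks,
// the overhanging threads would read and write out of bounds.
```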
@@ -290,24 +303,52 @@
 }

+
 namespace cv { namespace gpu { namespace impl
-{
-    extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf)
-    {
-        //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
-        //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
-
-        size_t smem_size = (BLOCK_W + N_DISPARITIES * SHARED_MEM_SIZE) * sizeof(unsigned int);
-
-        cudaSafeCall( cudaMemset2D(disp.ptr, disp.step, 0, disp.cols, disp.rows) );
-        cudaSafeCall( cudaMemset2D(minSSD_buf.ptr, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
-
+{
+    template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp)
+    {
         dim3 grid(1,1,1);
-        dim3 threads(BLOCK_W, 1, 1);
+        dim3 threads(BLOCK_W, 1, 1);

         grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
         grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
+
+        //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
+        size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
+
+        stereobm_gpu::stereoKernel<RADIUS><<<grid, threads, smem_size>>>(left.ptr, right.ptr, left.step, disp.ptr, disp.step, maxdisp);
+        cudaSafeCall( cudaThreadSynchronize() );
+    };
+
+    typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp);
+
+    const static kernel_caller_t callers[] =
+    {
+        0,
+        kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
+        kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
+        kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<14>, kernel_caller<15>,
+        kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
+        kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
+
+        //0,0,0, 0,0,0, 0,0,kernel_caller<9>
+    };
+    const int calles_num = sizeof(callers)/sizeof(callers[0]);
+
+    extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf)
+    {
+        int winsz2 = winsz >> 1;
+
+        if (winsz2 == 0 || winsz2 >= calles_num)
+            cv::gpu::error("Unsupported window size", __FILE__, __LINE__);
+
+        //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
+        //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
+
+        cudaSafeCall( cudaMemset2D(disp.ptr, disp.step, 0, disp.cols, disp.rows) );
+        cudaSafeCall( cudaMemset2D(minSSD_buf.ptr, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
+
         cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cwidth, &left.cols, sizeof(left.cols) ) );
         cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cheight, &left.rows, sizeof(left.rows) ) );
         cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cminSSDImage, &minSSD_buf.ptr, sizeof(minSSD_buf.ptr) ) );
@@ -315,8 +356,7 @@ namespace cv { namespace gpu { namespace impl
         size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
         cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cminSSD_step, &minssd_step, sizeof(minssd_step) ) );

-        stereobm_gpu::stereoKernel<<<grid, threads, smem_size>>>(left.ptr, right.ptr, left.step, disp.ptr, disp.step, maxdisp);
-        cudaSafeCall( cudaThreadSynchronize() );
+        callers[winsz2](left, right, disp, maxdisp);
     }
 }}}
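The callers table above is the standard runtime-to-compile-time dispatch trick: one stereoKernel instantiation per supported half-window size, indexed by winsz / 2, so every loop bound inside the kernel stays a compile-time constant. (Note the table originally listed kernel_caller<15> twice and skipped <14>; that is corrected above.) A self-contained sketch of the pattern, with hypothetical names:

```cpp
#include <cstdio>

// One instantiation per supported half-window size; inside each instantiation
// RADIUS is a compile-time constant, so loop bounds and indexing can unroll.
template <int RADIUS>
void run_stereo(int maxdisp)
{
    std::printf("window %dx%d, maxdisp %d\n", 2 * RADIUS + 1, 2 * RADIUS + 1, maxdisp);
}

typedef void (*caller_t)(int);

// Slot 0 is a null sentinel: winsz is odd and >= 3, so winsz / 2 >= 1.
static const caller_t callers_sketch[] = { 0, run_stereo<1>, run_stereo<2>, run_stereo<3> };

int main()
{
    int winsz = 7;                  // runtime value, e.g. StereoBM_GPU::winSize
    callers_sketch[winsz / 2](64);  // dispatches to run_stereo<3>
    return 0;
}
```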
@@ -327,18 +367,18 @@ namespace cv { namespace gpu { namespace impl
 namespace stereobm_gpu
 {

-texture<unsigned char, 2, cudaReadModeElementType> tex;
+texture<unsigned char, 2, cudaReadModeElementType> texForSobel;

-extern "C" __global__ void prefilert_kernel(unsigned char *output, size_t step, int width, int height, int prefilterCap)
+extern "C" __global__ void prefilter_kernel(unsigned char *output, size_t step, int width, int height, int prefilterCap)
 {
     int x = blockDim.x * blockIdx.x + threadIdx.x;
     int y = blockDim.y * blockIdx.y + threadIdx.y;

     if (x < width && y < height)
     {
-        int conv = (int)tex2D(tex, x - 1, y - 1) * (-1) + (int)tex2D(tex, x + 1, y - 1) * (1) +
-                   (int)tex2D(tex, x - 1, y    ) * (-2) + (int)tex2D(tex, x + 1, y    ) * (2) +
-                   (int)tex2D(tex, x - 1, y + 1) * (-1) + (int)tex2D(tex, x + 1, y + 1) * (1);
+        int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +
+                   (int)tex2D(texForSobel, x - 1, y    ) * (-2) + (int)tex2D(texForSobel, x + 1, y    ) * (2) +
+                   (int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);

         conv = min(min(max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);

@@ -353,7 +393,7 @@ namespace cv { namespace gpu { namespace impl
     extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, int prefilterCap)
     {
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-        cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::tex, input.ptr, desc, input.cols, input.rows, input.step ) );
+        cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::texForSobel, input.ptr, desc, input.cols, input.rows, input.step ) );

         dim3 threads(16, 16, 1);
         dim3 grid(1, 1, 1);
@@ -361,8 +401,139 @@ namespace cv { namespace gpu { namespace impl
         grid.x = divUp(input.cols, threads.x);
         grid.y = divUp(input.rows, threads.y);

-        stereobm_gpu::prefilert_kernel<<<grid, threads>>>(output.ptr, output.step, output.cols, output.rows, prefilterCap);
+        stereobm_gpu::prefilter_kernel<<<grid, threads>>>(output.ptr, output.step, output.cols, output.rows, prefilterCap);
         cudaSafeCall( cudaThreadSynchronize() );
+
+        cudaSafeCall( cudaUnbindTexture (stereobm_gpu::texForSobel ) );
     }
-}}}
\ No newline at end of file
+}}}
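For reference, the response mapping computed by prefilter_kernel reduces to the following CPU sketch (the helper name is hypothetical): the horizontal Sobel response is clamped to [-prefilterCap, prefilterCap], shifted to be non-negative, and saturated to 255 so the block matcher still operates on an 8-bit image.

```cpp
#include <algorithm>

// CPU sketch of the prefilter response mapping used by prefilter_kernel.
inline unsigned char prefilter_value(int sobel_x, int prefilterCap /* default 31 */)
{
    int v = std::min(std::min(std::max(-prefilterCap, sobel_x), prefilterCap)
                         + prefilterCap, 255);
    return static_cast<unsigned char>(v);  // lies in [0, min(2 * cap, 255)]
}
```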
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////// Textureness filtering ////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace stereobm_gpu
+{
+
+texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
+
+__device__ float sobel(int x, int y)
+{
+    float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
+                 tex2D(texForTF, x - 1, y    ) * (-2) + tex2D(texForTF, x + 1, y    ) * (2) +
+                 tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
+    return fabs(conv);
+}
+
+__device__ float CalcSums(float *cols, float *cols_cache, int winsz)
+{
+    float cache = 0;
+    float cache2 = 0;
+    int winsz2 = winsz/2;
+
+    for(int i = 1; i <= winsz2; i++)
+        cache += cols[i];
+
+    cols_cache[0] = cache;
+
+    __syncthreads();
+
+    if (threadIdx.x < blockDim.x - winsz2)
+        cache2 = cols_cache[winsz2];
+    else
+        for(int i = winsz2 + 1; i < winsz; i++)
+            cache2 += cols[i];
+
+    return cols[0] + cache + cache2;
+}
+
+#define RpT (2 * ROWSperTHREAD)  // got experimentally
+
+extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_step, int winsz, float threshold, int width, int height)
+{
+    int winsz2 = winsz/2;
+    int n_dirty_pixels = (winsz2) * 2;
+
+    extern __shared__ float cols_cache[];
+    float *cols = cols_cache + blockDim.x + threadIdx.x;
+    float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;
+
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int beg_row = blockIdx.y * RpT;
+    int end_row = min(beg_row + RpT, height);
+
+    if (x < width)
+    {
+        int y = beg_row;
+
+        float sum = 0;
+        float sum_extra = 0;
+
+        for(int i = y - winsz2; i <= y + winsz2; ++i)
+        {
+            sum += sobel(x - winsz2, i);
+            if (cols_extra)
+                sum_extra += sobel(x + blockDim.x - winsz2, i);
+        }
+        *cols = sum;
+        if (cols_extra)
+            *cols_extra = sum_extra;
+
+        __syncthreads();
+
+        float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
+        if (sum_win < threshold)
+            disp[y * disp_step + x] = 0;
+
+        __syncthreads();
+
+        for(int y = beg_row + 1; y < end_row; ++y)
+        {
+            sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);
+            *cols = sum;
+
+            if (cols_extra)
+            {
+                sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
+                *cols_extra = sum_extra;
+            }
+
+            __syncthreads();
+            float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
+            if (sum_win < threshold)
+                disp[y * disp_step + x] = 0;
+
+            __syncthreads();
+        }
+    }
+}
+}
+
+namespace cv { namespace gpu { namespace impl
+{
+    extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avgTexturenessThreshold, const DevMem2D& disp)
+    {
+        avgTexturenessThreshold *= winsz * winsz;
+
+        stereobm_gpu::texForTF.filterMode     = cudaFilterModeLinear;
+        stereobm_gpu::texForTF.addressMode[0] = cudaAddressModeWrap;
+        stereobm_gpu::texForTF.addressMode[1] = cudaAddressModeWrap;
+
+        cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
+        cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::texForTF, input.ptr, desc, input.cols, input.rows, input.step ) );
+
+        dim3 threads(128, 1, 1);
+        dim3 grid(1, 1, 1);
+
+        grid.x = divUp(input.cols, threads.x);
+        grid.y = divUp(input.rows, RpT);
+
+        size_t smem_size = (threads.x + threads.x + (winsz/2) * 2) * sizeof(float);
+
+        stereobm_gpu::textureness_kernel<<<grid, threads, smem_size>>>(disp.ptr, disp.step, winsz, avgTexturenessThreshold, disp.cols, disp.rows);
+        cudaSafeCall( cudaThreadSynchronize() );
+
+        cudaSafeCall( cudaUnbindTexture (stereobm_gpu::texForTF) );
+    }
+}}}
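The kernel above computes this test with sliding column sums in shared memory; stripped of those optimizations, the rule documented in gpu.hpp comes down to the CPU sketch below (names are hypothetical; note that postfilter_textureness folds the winsz * winsz factor into the threshold before launch, while the sketch keeps it explicit).

```cpp
// Direct CPU sketch of the textureness rule: disp(x, y) is zeroed when the
// sum of |horizontal Sobel| over the winsz x winsz window around (x, y) in
// the left image falls below (winsz * winsz) * avergeTexThreshold.
bool is_low_textured(const float* sobel_abs, int step, int x, int y,
                     int winsz, float avergeTexThreshold)
{
    int r = winsz / 2;
    float sum = 0.f;
    for (int dy = -r; dy <= r; ++dy)
        for (int dx = -r; dx <= r; ++dx)
            sum += sobel_abs[(y + dy) * step + (x + dx)];
    return sum < winsz * winsz * avergeTexThreshold;  // then disp(x, y) = 0
}
```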
diff --git a/modules/gpu/src/stereobm_gpu.cpp b/modules/gpu/src/stereobm_gpu.cpp
index c557d4e82..bc52dad7f 100644
--- a/modules/gpu/src/stereobm_gpu.cpp
+++ b/modules/gpu/src/stereobm_gpu.cpp
@@ -48,12 +48,11 @@ using namespace cv::gpu;
 #if !defined (HAVE_CUDA)

 cv::gpu::StereoBM_GPU::StereoBM_GPU() { throw_nogpu(); }
-cv::gpu::StereoBM_GPU::StereoBM_GPU(int preset_, int ndisparities_) { throw_nogpu(); }
+cv::gpu::StereoBM_GPU::StereoBM_GPU(int, int, int) { throw_nogpu(); }
 bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable() { throw_nogpu(); return false; }
-void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity) { throw_nogpu(); }
-void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, const CudaStream& stream) { throw_nogpu(); }
-
+void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&, const CudaStream&) { throw_nogpu(); }

 #else /* !defined (HAVE_CUDA) */

@@ -61,17 +60,24 @@ namespace cv { namespace gpu
 {
     namespace impl
     {
-        extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf);
+        extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int ndisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf);
         extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, int prefilterCap = 31);
+        extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avergeTexThreshold, const DevMem2D& disp);
     }
 }}
+
+const float defaultAvgTexThreshold = 3;

-cv::gpu::StereoBM_GPU::StereoBM_GPU() : preset(BASIC_PRESET), ndisp(DEFAULT_NDISP), winSize(DEFAULT_WINSZ) {}
-cv::gpu::StereoBM_GPU::StereoBM_GPU(int preset_, int ndisparities_, int winSize_) : preset(preset_), ndisp(ndisparities_), winSize(winSize_)
+cv::gpu::StereoBM_GPU::StereoBM_GPU()
+    : preset(BASIC_PRESET), ndisp(DEFAULT_NDISP), winSize(DEFAULT_WINSZ), avergeTexThreshold(defaultAvgTexThreshold) {}
+
+cv::gpu::StereoBM_GPU::StereoBM_GPU(int preset_, int ndisparities_, int winSize_)
+    : preset(preset_), ndisp(ndisparities_), winSize(winSize_), avergeTexThreshold(defaultAvgTexThreshold)
 {
     const int max_supported_ndisp = 1 << (sizeof(unsigned char) * 8);

     CV_Assert(0 < ndisp && ndisp <= max_supported_ndisp);
     CV_Assert(ndisp % 8 == 0);
+    CV_Assert(winSize % 2 == 1);
 }

 bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
@@ -87,7 +93,7 @@ bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()

     if (major > 1 || numSM > 16)
         return true;
-
+
     return false;
 }

@@ -102,19 +108,22 @@ void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right

     GpuMat le_for_bm = left;
     GpuMat ri_for_bm = right;
-
+
     if (preset == PREFILTER_XSOBEL)
     {
         leBuf.create( left.size(), left.type());
         riBuf.create(right.size(), right.type());
-
+
         impl::prefilter_xsobel( left, leBuf);
         impl::prefilter_xsobel(right, riBuf);

         le_for_bm = leBuf;
         ri_for_bm = riBuf;
     }

-    impl::stereoBM_GPU(le_for_bm, ri_for_bm, disparity, ndisp, winSize, minSSD);
+    impl::stereoBM_GPU(le_for_bm, ri_for_bm, disparity, ndisp, winSize, minSSD);
+
+    if (avergeTexThreshold)
+        impl::postfilter_textureness(le_for_bm, winSize, avergeTexThreshold, disparity);
 }

 void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, const CudaStream& stream)
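Taken together, host code would use the patched API roughly as follows. This is a hypothetical usage sketch (image upload and download omitted; it assumes the preset enum referenced in stereobm_gpu.cpp is exposed on the class, as it is in this header):

```cpp
#include <opencv2/gpu/gpu.hpp>

// Hypothetical host-side usage of the patched StereoBM_GPU: winSize is now
// variable (odd, half-window 1..25, i.e. 3..51) and avergeTexThreshold
// controls the new low-texture post-filter.
void compute_disparity(const cv::gpu::GpuMat& left, const cv::gpu::GpuMat& right,
                       cv::gpu::GpuMat& disparity)
{
    cv::gpu::StereoBM_GPU bm(cv::gpu::StereoBM_GPU::PREFILTER_XSOBEL,
                             64,    // ndisp: multiple of 8, at most 256
                             19);   // winSize: odd, no longer fixed to 19
    bm.avergeTexThreshold = 4.0f;   // 0 disables the textureness post-filter
    bm(left, right, disparity);     // disparity comes back as CV_8U
}
```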