From b5f717b6b38846449206b53fd56628804d328be0 Mon Sep 17 00:00:00 2001 From: Konstantin Matskevich Date: Thu, 16 Jan 2014 14:10:17 +0400 Subject: [PATCH 01/10] stereoBM --- modules/calib3d/perf/opencl/perf_stereobm.cpp | 76 +++++++++ modules/calib3d/src/opencl/stereobm.cl | 159 ++++++++++++++++++ modules/calib3d/src/precomp.hpp | 2 + modules/calib3d/src/stereobm.cpp | 120 +++++++++++-- modules/calib3d/test/opencl/test_stereobm.cpp | 96 +++++++++++ 5 files changed, 443 insertions(+), 10 deletions(-) create mode 100644 modules/calib3d/perf/opencl/perf_stereobm.cpp create mode 100644 modules/calib3d/src/opencl/stereobm.cl create mode 100644 modules/calib3d/test/opencl/test_stereobm.cpp diff --git a/modules/calib3d/perf/opencl/perf_stereobm.cpp b/modules/calib3d/perf/opencl/perf_stereobm.cpp new file mode 100644 index 000000000..3352e6b1a --- /dev/null +++ b/modules/calib3d/perf/opencl/perf_stereobm.cpp @@ -0,0 +1,76 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "perf_precomp.hpp" +#include "opencv2/ts/ocl_perf.hpp" + +#ifdef HAVE_OPENCL + +namespace cvtest { +namespace ocl { + +typedef std::tr1::tuple StereoBMFixture_t; +typedef TestBaseWithParam StereoBMFixture; + +OCL_PERF_TEST_P(StereoBMFixture, StereoBM, ::testing::Combine(OCL_PERF_ENUM(32, 64, 128), OCL_PERF_ENUM(11,21) ) ) +{ + const int n_disp = get<0>(GetParam()), winSize = get<1>(GetParam()); + UMat left, right, disp; + + imread(getDataPath("gpu/stereobm/aloe-L.png"), IMREAD_GRAYSCALE).copyTo(left); + imread(getDataPath("gpu/stereobm/aloe-R.png"), IMREAD_GRAYSCALE).copyTo(right); + ASSERT_FALSE(left.empty()); + ASSERT_FALSE(right.empty()); + + declare.in(left, right); + + Ptr bm = createStereoBM( n_disp, winSize ); + bm->setPreFilterType(bm->PREFILTER_NORMALIZED_RESPONSE); + + OCL_TEST_CYCLE() bm->compute(left, right, disp); + + SANITY_CHECK(disp, 1e-2, ERROR_RELATIVE); +} + +}//ocl +}//cvtest +#endif diff --git a/modules/calib3d/src/opencl/stereobm.cl b/modules/calib3d/src/opencl/stereobm.cl new file mode 100644 index 000000000..d8f238b89 --- /dev/null +++ b/modules/calib3d/src/opencl/stereobm.cl @@ -0,0 +1,159 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + + +////////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////// stereoBM ////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef SIZE + +__kernel void stereoBM(__global const uchar * left, __global const uchar * right, __global uchar * dispptr, + int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp, + int preFilterCap, int winsize, int textureTreshold, int uniquenessRatio) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int wsz2 = winsize/2; + short FILTERED = (mindisp - 1)<<4; + + if(x < cols && y < rows ) + { + int dispIdx = mad24(y, disp_step, disp_offset + x*(int)sizeof(short) ); + __global short * disp = (__global short*)(dispptr + dispIdx); + disp[0] = FILTERED; + if( (x > mindisp+ndisp+wsz2-2) && (y > wsz2-1) && (x < cols-wsz2-mindisp) && (y < rows - wsz2)) + { + int cost[SIZE]; + int textsum = 0; + + for(int d = mindisp; d < ndisp+mindisp; d++) + { + cost[d-mindisp] = 0; + for(int i = -wsz2; i < wsz2+1; i++) + for(int j = -wsz2; j < wsz2+1; j++) + { + textsum += abs( left[min( y+i, rows-1 ) * cols + min( x+j, cols-1 )] - preFilterCap ); + cost[d-mindisp] += abs( left[min( y+i, rows-1 ) * cols + min( x+j, cols-1 )] + - right[min( y+i, rows-1 ) * cols + min( x+j-d, cols-1 )] ); + } + } + + int best_disp = mindisp, best_cost = cost[0]; + for(int d = mindisp; d < ndisp+mindisp; d++) + { + best_cost = (cost[d-mindisp] < best_cost) ? cost[d-mindisp] : best_cost; + best_disp = (best_cost == cost[d-mindisp]) ? d : best_disp; + } + + int thresh = best_cost + (best_cost * uniquenessRatio/100); + for(int d = mindisp; (d < ndisp + mindisp) && (uniquenessRatio > 0); d++) + { + best_disp = ( (cost[d-mindisp] <= thresh) && (d < best_disp-1 || d > best_disp + 1) ) ? FILTERED : best_disp; + } + + disp[0] = textsum < textureTreshold ? (FILTERED) : (best_disp == FILTERED) ? (short)(best_disp) : (short)(best_disp); + + if( best_disp != FILTERED ) + { + int y1 = (best_disp > mindisp) ? cost[best_disp-mindisp-1] : cost[best_disp-mindisp+1], + y2 = cost[best_disp-mindisp], + y3 = (best_disp < mindisp+ndisp-1) ? cost[best_disp-mindisp+1] : cost[best_disp-mindisp-1]; + float a = (y3 - ((best_disp+1)*(y2-y1) + best_disp*y1 - (best_disp-1)*y2)/(best_disp - (best_disp-1)) )/ + ((best_disp+1)*((best_disp+1) - (best_disp-1) - best_disp) + (best_disp-1)*best_disp); + float b = (y2 - y1)/(best_disp - (best_disp-1)) - a*((best_disp-1)+best_disp); + disp[0] = (y1 == y2 || y2 == y3) ? (short)(best_disp*16) : (short)(-b/(2*a)*16); + } + } + } +} + +#endif + +////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////// Norm Prefiler //////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////// + +__kernel void prefilter_norm(__global unsigned char *input, __global unsigned char *output, + int rows, int cols, int prefilterCap, int winsize, int scale_g, int scale_s) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int wsz2 = winsize/2; + + if(x < cols && y < rows) + { + int cov1 = input[ max(y-1, 0) * cols + x] * 1 + + input[y * cols + max(x-1,0)] * 1 + input[ y * cols + x] * 4 + input[y * cols + min(x+1, cols-1)] * 1 + + input[min(y+1, rows-1) * cols + x] * 1; + int cov2 = 0; + for(int i = -wsz2; i < wsz2+1; i++) + for(int j = -wsz2; j < wsz2+1; j++) + cov2 += input[min( max( (y+i),0 ),rows-1 ) * cols + min( max( (x+j),0 ),cols-1 )]; + + int res = (cov1*scale_g - cov2*scale_s)>>10; + res = min(min(max(-prefilterCap, res), prefilterCap) + prefilterCap, 255); + output[y * cols + x] = res & 0xFF; + } +} + + +////////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////// Sobel Prefiler //////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////// + +__kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned char *output, + int rows, int cols, int prefilterCap) +{ + int x = get_global_id(0); + int y = get_global_id(1); + output[y * cols + x] = min(prefilterCap, 255) & 0xFF; + if(x < cols && y < rows-1 && x > 0) + { + int cov = input[((y > 0) ? y-1 : y+1) * cols + (x-1)] * (-1) + input[((y > 0) ? y-1 : y+1) * cols + ((x #include +#include "opencl_kernels.hpp" namespace cv { @@ -85,6 +86,26 @@ struct StereoBMParams int dispType; }; +static bool ocl_prefilter_norm(InputArray _input, OutputArray _output, int winsize, int prefilterCap) +{ + ocl::Kernel k("prefilter_norm", ocl::calib3d::stereobm_oclsrc); + if(k.empty()) + return false; + + int scale_g = winsize*winsize/8, scale_s = (1024 + scale_g)/(scale_g*2); + scale_g *= scale_s; + + UMat input = _input.getUMat(), output; + _output.create(input.size(), input.type()); + output = _output.getUMat(); + + size_t globalThreads[3] = { input.cols, input.rows, 1 }; + + k.args(ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output), input.rows, input.cols, + prefilterCap, winsize, scale_g, scale_s); + + return k.run(2, globalThreads, NULL, false); +} static void prefilterNorm( const Mat& src, Mat& dst, int winsize, int ftzero, uchar* buf ) { @@ -149,6 +170,24 @@ static void prefilterNorm( const Mat& src, Mat& dst, int winsize, int ftzero, uc } } +static bool ocl_prefilter_xsobel(InputArray _input, OutputArray _output, int prefilterCap) +{ + ocl::Kernel k("prefilter_xsobel", ocl::calib3d::stereobm_oclsrc); + if(k.empty()) + return false; + + UMat input = _input.getUMat(), output; + _output.create(input.size(), input.type()); + output = _output.getUMat(); + + size_t blockSize = 1; + size_t globalThreads[3] = { input.cols, input.rows, 1 }; + size_t localThreads[3] = { blockSize, blockSize, 1 }; + + k.args(ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output), input.rows, input.cols, prefilterCap); + + return k.run(2, globalThreads, localThreads, false); +} static void prefilterXSobel( const Mat& src, Mat& dst, int ftzero ) @@ -534,7 +573,6 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right, hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp; lptr = lptr0 + std::min(std::max(x, -lofs), width-lofs-1) - dy0*sstep; rptr = rptr0 + std::min(std::max(x, -rofs), width-rofs-1) - dy0*sstep; - for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep ) { int lval = lptr[0]; @@ -651,6 +689,25 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right, } } +static bool ocl_prefiltering(InputArray left0, InputArray right0, OutputArray left, OutputArray right, StereoBMParams* state) +{ + if( state->preFilterType == StereoBM::PREFILTER_NORMALIZED_RESPONSE ) + { + if(!ocl_prefilter_norm( left0, left, state->preFilterSize, state->preFilterCap)) + return false; + if(!ocl_prefilter_norm( right0, right, state->preFilterSize, state->preFilterCap)) + return false; + } + else + { + if(!ocl_prefilter_xsobel( left0, left, state->preFilterCap )) + return false; + if(!ocl_prefilter_xsobel( right0, right, state->preFilterCap)) + return false; + } + return true; +} + struct PrefilterInvoker : public ParallelLoopBody { PrefilterInvoker(const Mat& left0, const Mat& right0, Mat& left, Mat& right, @@ -679,6 +736,32 @@ struct PrefilterInvoker : public ParallelLoopBody StereoBMParams* state; }; +static bool ocl_stereo( InputArray _left, InputArray _right, + OutputArray _disp, StereoBMParams* state) +{ + ocl::Kernel k("stereoBM", ocl::calib3d::stereobm_oclsrc, cv::format("-D SIZE=%d", state->numDisparities ) ); + if(k.empty()) + return false; + + UMat left = _left.getUMat(), right = _right.getUMat(); + _disp.create(_left.size(), CV_16S); + UMat disp = _disp.getUMat(); + + size_t globalThreads[3] = { left.cols, left.rows, 1 }; + + int idx = 0; + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left)); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(right)); + idx = k.set(idx, ocl::KernelArg::WriteOnly(disp)); + idx = k.set(idx, state->minDisparity); + idx = k.set(idx, state->numDisparities); + idx = k.set(idx, state->preFilterCap); + idx = k.set(idx, state->SADWindowSize); + idx = k.set(idx, state->textureThreshold); + idx = k.set(idx, state->uniquenessRatio); + + return k.run(2, globalThreads, NULL, false); +} struct FindStereoCorrespInvoker : public ParallelLoopBody { @@ -776,21 +859,18 @@ public: void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr ) { - Mat left0 = leftarr.getMat(), right0 = rightarr.getMat(); int dtype = disparr.fixedType() ? disparr.type() : params.dispType; + Size leftsize = leftarr.size(); - if (left0.size() != right0.size()) + if (leftarr.size() != rightarr.size()) CV_Error( Error::StsUnmatchedSizes, "All the images must have the same size" ); - if (left0.type() != CV_8UC1 || right0.type() != CV_8UC1) + if (leftarr.type() != CV_8UC1 || rightarr.type() != CV_8UC1) CV_Error( Error::StsUnsupportedFormat, "Both input images must have CV_8UC1" ); if (dtype != CV_16SC1 && dtype != CV_32FC1) CV_Error( Error::StsUnsupportedFormat, "Disparity image must have CV_16SC1 or CV_32FC1 format" ); - disparr.create(left0.size(), dtype); - Mat disp0 = disparr.getMat(); - if( params.preFilterType != PREFILTER_NORMALIZED_RESPONSE && params.preFilterType != PREFILTER_XSOBEL ) CV_Error( Error::StsOutOfRange, "preFilterType must be = CV_STEREO_BM_NORMALIZED_RESPONSE" ); @@ -802,7 +882,7 @@ public: CV_Error( Error::StsOutOfRange, "preFilterCap must be within 1..63" ); if( params.SADWindowSize < 5 || params.SADWindowSize > 255 || params.SADWindowSize % 2 == 0 || - params.SADWindowSize >= std::min(left0.cols, left0.rows) ) + params.SADWindowSize >= std::min(leftsize.width, leftsize.height) ) CV_Error( Error::StsOutOfRange, "SADWindowSize must be odd, be within 5..255 and be not larger than image width or height" ); if( params.numDisparities <= 0 || params.numDisparities % 16 != 0 ) @@ -814,6 +894,26 @@ public: if( params.uniquenessRatio < 0 ) CV_Error( Error::StsOutOfRange, "uniqueness ratio must be non-negative" ); + int FILTERED = (params.minDisparity - 1) << DISPARITY_SHIFT; + + if(ocl::useOpenCL() && disparr.isUMat()) + { + UMat left, right; + CV_Assert(ocl_prefiltering(leftarr, rightarr, left, right, ¶ms)); + CV_Assert(ocl_stereo(left, right, disparr, ¶ms)); + + if( params.speckleRange >= 0 && params.speckleWindowSize > 0 ) + filterSpeckles(disparr.getMat(), FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf); + + if (dtype == CV_32F) + disparr.getUMat().convertTo(disparr, CV_32FC1, 1./(1 << DISPARITY_SHIFT), 0); + return; + } + + Mat left0 = leftarr.getMat(), right0 = rightarr.getMat(); + disparr.create(left0.size(), dtype); + Mat disp0 = disparr.getMat(); + preFilteredImg0.create( left0.size(), CV_8U ); preFilteredImg1.create( left0.size(), CV_8U ); cost.create( left0.size(), CV_16S ); @@ -828,7 +928,6 @@ public: int lofs = std::max(ndisp - 1 + mindisp, 0); int rofs = -std::min(ndisp - 1 + mindisp, 0); int width1 = width - rofs - ndisp + 1; - int FILTERED = (params.minDisparity - 1) << DISPARITY_SHIFT; if( lofs >= width || rofs >= width || width1 < 1 ) { @@ -855,7 +954,7 @@ public: bufSize2 = width*height*(sizeof(Point_) + sizeof(int) + sizeof(uchar)); #if CV_SSE2 - bool useShorts = params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2); + bool useShorts = false;//params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2); #else const bool useShorts = false; #endif @@ -870,6 +969,7 @@ public: slidingSumBuf.create( 1, bufSize, CV_8U ); uchar *_buf = slidingSumBuf.data; + parallel_for_(Range(0, 2), PrefilterInvoker(left0, right0, left, right, _buf, _buf + bufSize1, ¶ms), 1); Rect validDisparityRect(0, 0, width, height), R1 = params.roi1, R2 = params.roi2; diff --git a/modules/calib3d/test/opencl/test_stereobm.cpp b/modules/calib3d/test/opencl/test_stereobm.cpp new file mode 100644 index 000000000..c3903f6a8 --- /dev/null +++ b/modules/calib3d/test/opencl/test_stereobm.cpp @@ -0,0 +1,96 @@ +/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "test_precomp.hpp" +#include "cvconfig.h" +#include "opencv2/ts/ocl_test.hpp" + +#ifdef HAVE_OPENCL + +namespace cvtest { +namespace ocl { + +PARAM_TEST_CASE(StereoBMFixture, int, int) +{ + int n_disp; + int winSize; + Mat left, right, disp; + UMat uleft, uright, udisp; + + virtual void SetUp() + { + n_disp = GET_PARAM(0); + winSize = GET_PARAM(1); + + left = readImage("gpu/stereobm/aloe-L.png", IMREAD_GRAYSCALE); + right = readImage("gpu/stereobm/aloe-R.png", IMREAD_GRAYSCALE); + + ASSERT_FALSE(left.empty()); + ASSERT_FALSE(right.empty()); + + left.copyTo(uleft); + right.copyTo(uright); + } + + void Near(double eps = 0.0) + { + EXPECT_MAT_NEAR_RELATIVE(disp, udisp, eps); + } +}; + +OCL_TEST_P(StereoBMFixture, StereoBM) +{ + Ptr bm = createStereoBM( n_disp, winSize); + bm->setPreFilterType(bm->PREFILTER_XSOBEL); + + OCL_OFF(bm->compute(left, right, disp)); + OCL_ON(bm->compute(uleft, uright, udisp)); + + Near(1e-2); +} + +OCL_INSTANTIATE_TEST_CASE_P(StereoMatcher, StereoBMFixture, testing::Combine(testing::Values(128), + testing::Values(15))); +}//ocl +}//cvtest + +#endif //HAVE_OPENCL From bfc843a5f53ec05277ee3fd5b7d800bcc52b8984 Mon Sep 17 00:00:00 2001 From: Konstantin Matskevich Date: Tue, 18 Feb 2014 14:08:22 +0400 Subject: [PATCH 02/10] added optimization --- modules/calib3d/perf/opencl/perf_stereobm.cpp | 2 +- modules/calib3d/src/opencl/stereobm.cl | 95 ++++++++++++++++++- modules/calib3d/src/stereobm.cpp | 42 +++++++- modules/calib3d/test/opencl/test_stereobm.cpp | 6 +- 4 files changed, 138 insertions(+), 7 deletions(-) diff --git a/modules/calib3d/perf/opencl/perf_stereobm.cpp b/modules/calib3d/perf/opencl/perf_stereobm.cpp index 3352e6b1a..dd2bc9e0a 100644 --- a/modules/calib3d/perf/opencl/perf_stereobm.cpp +++ b/modules/calib3d/perf/opencl/perf_stereobm.cpp @@ -68,7 +68,7 @@ OCL_PERF_TEST_P(StereoBMFixture, StereoBM, ::testing::Combine(OCL_PERF_ENUM(32, OCL_TEST_CYCLE() bm->compute(left, right, disp); - SANITY_CHECK(disp, 1e-2, ERROR_RELATIVE); + SANITY_CHECK(disp, 0.05, ERROR_RELATIVE); } }//ocl diff --git a/modules/calib3d/src/opencl/stereobm.cl b/modules/calib3d/src/opencl/stereobm.cl index d8f238b89..7ab58dfec 100644 --- a/modules/calib3d/src/opencl/stereobm.cl +++ b/modules/calib3d/src/opencl/stereobm.cl @@ -45,9 +45,102 @@ ////////////////////////////////////////// stereoBM ////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef csize + +__kernel void stereoBM_opt(__global const uchar * left, __global const uchar * right, __global uchar * dispptr, + int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp, + int preFilterCap, int winsize, int textureTreshold, int uniquenessRatio) +{ + int total_x = get_global_id(0); + int gx = get_group_id(0), x = gx*ndisp; + int y = get_global_id(1); + int d = get_local_id(0) + mindisp; + int wsz2 = winsize/2; + short FILTERED = (mindisp - 1)<<4; + __local int cost[csize]; + int textsum[tsize]; + if( total_x ndisp-1) && (y > wsz2-1) && (total_x < cols + ndisp - cols%ndisp) && (y < rows - wsz2)) + { + for(; (x <= ndisp+mindisp+wsz2-2); x++) + { + cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] = INT_MAX; + textsum[x%(gx*ndisp)] = INT_MAX; + } + cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] = 0; + textsum[x%(gx*ndisp)] = 0; + for(int i = -wsz2; i < wsz2+1; i++) + for(int j = -wsz2; j < wsz2+1; j++) + { + cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] += abs( left[min( y+i, rows-1 ) * cols + min( x+j, cols-1 )] + - right[min( y+i, rows-1 ) * cols + min( x+j-d, cols-1 )] ); + textsum[x%(gx*ndisp)] += abs( left[min( y+i, rows-1 ) * cols + min( x+j, cols-1 )] - preFilterCap ); + } + x++; + for(; (x < gx*ndisp + ndisp) && (x < cols-wsz2-mindisp); x++) + { + cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] = cost[(d-mindisp)+ndisp*((x-1)%(gx*ndisp))]; + textsum[x%(gx*ndisp)] = textsum[(x-1)%(gx*ndisp)]; + for(int i = -wsz2; i < wsz2+1; i++) + { + cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] += -abs( left[min( y+i, rows-1 ) * cols + min( x-wsz2-1, cols-1 )] + - right[min( y+i, rows-1 ) * cols + min( x-wsz2-1-d, cols-1 )] ) + + abs( left[min( y+i, rows-1 ) * cols + min( x+wsz2, cols-1 )] + - right[min( y+i, rows-1 ) * cols + min( x+wsz2-d, cols-1 )] ); + textsum[x%(gx*ndisp)] += -abs( left[min( y+i, rows-1 ) * cols + min( x-wsz2-1, cols-1 )] - preFilterCap ) + + abs( left[min( y+i, rows-1 ) * cols + min( x+wsz2, cols-1 )] - preFilterCap ); + } + } + + for(; (x > cols - (cols-1)%ndisp - 1) && (x < cols + ndisp - (cols-1)%ndisp - 1); x++) + { + cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] = INT_MAX; + textsum[x%(gx*ndisp)] = INT_MAX; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int best_disp = FILTERED, best_cost = INT_MAX-1; + for(int i = 0; (i < ndisp); i++) + { + best_cost = (cost[i + ndisp*(d-mindisp)] < best_cost) ? cost[i + ndisp*(d-mindisp)] : best_cost; + best_disp = (best_cost == cost[i + ndisp*(d-mindisp)]) ? i+mindisp : best_disp; + } + + int thresh = best_cost + (best_cost * uniquenessRatio/100); + for(int i = 0; (i < ndisp) && (uniquenessRatio > 0); i++) + { + best_disp = ( (cost[i + ndisp*(d-mindisp)] <= thresh) && (i < best_disp - mindisp - 1 || i > best_disp - mindisp + 1) ) ? + FILTERED : best_disp; + } + + disp[0] = textsum[d-mindisp] < textureTreshold ? (FILTERED) : (best_disp == FILTERED) ? (short)(best_disp) : (short)(best_disp); + + if( best_disp != FILTERED ) + { + int y1 = (best_disp > mindisp) ? cost[(best_disp-mindisp-1) + ndisp*(d-mindisp)] : + cost[(best_disp-mindisp+1) + ndisp*(d-mindisp)], + y2 = cost[(best_disp-mindisp) + ndisp*(d-mindisp)], + y3 = (best_disp < mindisp+ndisp-1) ? cost[(best_disp-mindisp+1) + ndisp*(d-mindisp)] : + cost[(best_disp-mindisp-1) + ndisp*(d-mindisp)]; + float a = (y3 - ((best_disp+1)*(y2-y1) + best_disp*y1 - (best_disp-1)*y2)/(best_disp - (best_disp-1)) )/ + ((best_disp+1)*((best_disp+1) - (best_disp-1) - best_disp) + (best_disp-1)*best_disp); + float b = (y2 - y1)/(best_disp - (best_disp-1)) - a*((best_disp-1)+best_disp); + disp[0] = (y1 == y2 || y3 == y2) ? (short)(best_disp*16) :(short)(-b/(2*a)*16); + } + } + } +} + +#endif + #ifdef SIZE -__kernel void stereoBM(__global const uchar * left, __global const uchar * right, __global uchar * dispptr, +__kernel void stereoBM_BF(__global const uchar * left, __global const uchar * right, __global uchar * dispptr, int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp, int preFilterCap, int winsize, int textureTreshold, int uniquenessRatio) { diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp index 510a457d5..05652f03d 100644 --- a/modules/calib3d/src/stereobm.cpp +++ b/modules/calib3d/src/stereobm.cpp @@ -736,10 +736,39 @@ struct PrefilterInvoker : public ParallelLoopBody StereoBMParams* state; }; -static bool ocl_stereo( InputArray _left, InputArray _right, +static bool ocl_stereobm_opt( InputArray _left, InputArray _right, OutputArray _disp, StereoBMParams* state) { - ocl::Kernel k("stereoBM", ocl::calib3d::stereobm_oclsrc, cv::format("-D SIZE=%d", state->numDisparities ) ); + int ndisp = state->numDisparities; + ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D tsize=%d", ndisp*ndisp, ndisp) ); + if(k.empty()) + return false; + + UMat left = _left.getUMat(), right = _right.getUMat(); + _disp.create(_left.size(), CV_16S); + UMat disp = _disp.getUMat(); + + size_t globalThreads[3] = { left.cols, left.rows, 1 }; + size_t localThreads[3] = {ndisp, 1, 1}; + + int idx = 0; + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left)); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(right)); + idx = k.set(idx, ocl::KernelArg::WriteOnly(disp)); + idx = k.set(idx, state->minDisparity); + idx = k.set(idx, ndisp); + idx = k.set(idx, state->preFilterCap); + idx = k.set(idx, state->SADWindowSize); + idx = k.set(idx, state->textureThreshold); + idx = k.set(idx, state->uniquenessRatio); + + return k.run(2, globalThreads, localThreads, false); +} + +static bool ocl_stereobm_bf(InputArray _left, InputArray _right, + OutputArray _disp, StereoBMParams* state) +{ + ocl::Kernel k("stereoBM_BF", ocl::calib3d::stereobm_oclsrc, cv::format("-D SIZE=%d", state->numDisparities ) ); if(k.empty()) return false; @@ -763,6 +792,15 @@ static bool ocl_stereo( InputArray _left, InputArray _right, return k.run(2, globalThreads, NULL, false); } +static bool ocl_stereo(InputArray _left, InputArray _right, + OutputArray _disp, StereoBMParams* state) +{ + if(ocl::Device::getDefault().localMemSize() > state->numDisparities * state->numDisparities * sizeof(int) ) + return ocl_stereobm_opt(_left, _right, _disp, state); + else + return ocl_stereobm_bf(_left, _right, _disp, state); +} + struct FindStereoCorrespInvoker : public ParallelLoopBody { FindStereoCorrespInvoker( const Mat& _left, const Mat& _right, diff --git a/modules/calib3d/test/opencl/test_stereobm.cpp b/modules/calib3d/test/opencl/test_stereobm.cpp index c3903f6a8..15fa93aa4 100644 --- a/modules/calib3d/test/opencl/test_stereobm.cpp +++ b/modules/calib3d/test/opencl/test_stereobm.cpp @@ -85,11 +85,11 @@ OCL_TEST_P(StereoBMFixture, StereoBM) OCL_OFF(bm->compute(left, right, disp)); OCL_ON(bm->compute(uleft, uright, udisp)); - Near(1e-2); + Near(0.05); } -OCL_INSTANTIATE_TEST_CASE_P(StereoMatcher, StereoBMFixture, testing::Combine(testing::Values(128), - testing::Values(15))); +OCL_INSTANTIATE_TEST_CASE_P(StereoMatcher, StereoBMFixture, testing::Combine(testing::Values(32, 64, 128), + testing::Values(11, 21))); }//ocl }//cvtest From 0904f10ab59ae9e1e03655e368b52789be727126 Mon Sep 17 00:00:00 2001 From: Konstantin Matskevich Date: Tue, 18 Feb 2014 14:24:26 +0400 Subject: [PATCH 03/10] optimizations --- modules/calib3d/perf/opencl/perf_stereobm.cpp | 6 +- modules/calib3d/src/opencl/stereobm.cl | 202 ++++++++++-------- modules/calib3d/src/stereobm.cpp | 25 +-- modules/calib3d/test/opencl/test_stereobm.cpp | 20 +- 4 files changed, 148 insertions(+), 105 deletions(-) diff --git a/modules/calib3d/perf/opencl/perf_stereobm.cpp b/modules/calib3d/perf/opencl/perf_stereobm.cpp index dd2bc9e0a..b795a3526 100644 --- a/modules/calib3d/perf/opencl/perf_stereobm.cpp +++ b/modules/calib3d/perf/opencl/perf_stereobm.cpp @@ -51,7 +51,7 @@ namespace ocl { typedef std::tr1::tuple StereoBMFixture_t; typedef TestBaseWithParam StereoBMFixture; -OCL_PERF_TEST_P(StereoBMFixture, StereoBM, ::testing::Combine(OCL_PERF_ENUM(32, 64, 128), OCL_PERF_ENUM(11,21) ) ) +OCL_PERF_TEST_P(StereoBMFixture, StereoBM, ::testing::Combine(OCL_PERF_ENUM(32, 64), OCL_PERF_ENUM(11,21) ) ) { const int n_disp = get<0>(GetParam()), winSize = get<1>(GetParam()); UMat left, right, disp; @@ -64,11 +64,11 @@ OCL_PERF_TEST_P(StereoBMFixture, StereoBM, ::testing::Combine(OCL_PERF_ENUM(32, declare.in(left, right); Ptr bm = createStereoBM( n_disp, winSize ); - bm->setPreFilterType(bm->PREFILTER_NORMALIZED_RESPONSE); + bm->setPreFilterType(bm->PREFILTER_XSOBEL); OCL_TEST_CYCLE() bm->compute(left, right, disp); - SANITY_CHECK(disp, 0.05, ERROR_RELATIVE); + SANITY_CHECK(disp, 1e-3, ERROR_RELATIVE); } }//ocl diff --git a/modules/calib3d/src/opencl/stereobm.cl b/modules/calib3d/src/opencl/stereobm.cl index 7ab58dfec..2e74f591c 100644 --- a/modules/calib3d/src/opencl/stereobm.cl +++ b/modules/calib3d/src/opencl/stereobm.cl @@ -47,90 +47,119 @@ #ifdef csize -__kernel void stereoBM_opt(__global const uchar * left, __global const uchar * right, __global uchar * dispptr, +#define MAX_VAL 32767 + +__kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar * rightptr, __global uchar * dispptr, int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp, - int preFilterCap, int winsize, int textureTreshold, int uniquenessRatio) + int preFilterCap, int nthreads, int textureTreshold, int uniquenessRatio) { - int total_x = get_global_id(0); - int gx = get_group_id(0), x = gx*ndisp; - int y = get_global_id(1); - int d = get_local_id(0) + mindisp; - int wsz2 = winsize/2; + int x = get_global_id(0); + int total_y = get_global_id(1); + int z = get_local_id(2); + int d = get_local_id(1); + int gy = get_group_id(1), y = gy*ndisp + z*ndisp/nthreads; + int wsz2 = wsz/2; short FILTERED = (mindisp - 1)<<4; - __local int cost[csize]; - int textsum[tsize]; - if( total_x ndisp-1) && (y > wsz2-1) && (total_x < cols + ndisp - cols%ndisp) && (y < rows - wsz2)) + short costbuf[wsz]; + short textbuf[wsz]; + int head = 0; + + if( (x > ndisp+mindisp+wsz2-2) && (x < cols - wsz2 - mindisp) ) + { + cost += (y < wsz2) ? ndisp*wsz2 : 0; + y = (y cols - (cols-1)%ndisp - 1) && (x < cols + ndisp - (cols-1)%ndisp - 1); x++) + int costdiff = 0, textdiff = 0; + #pragma unroll + for(int j = 0; j < wsz; j++) { - cost[(d-mindisp)+ndisp*(x%(gx*ndisp))] = INT_MAX; - textsum[x%(gx*ndisp)] = INT_MAX; + costdiff += abs( left[0] - right[0] ); + textdiff += abs( left[0] - preFilterCap ); + left++; right++; } - barrier(CLK_LOCAL_MEM_FENCE); + cost[0] += costdiff; + textsum[y-(gy*ndisp)] += textdiff; + costbuf[head] = costdiff; + textbuf[head] = textdiff; + head++; + } + y++; + for(; (y < gy*ndisp + ndisp/nthreads + z*ndisp/nthreads) && (y < rows-wsz2); y++) + { + head = head%wsz; + cost += ndisp; + cost[0] = cost[-ndisp]; + textsum[y-(gy*ndisp)] = textsum[(y-1)-(gy*ndisp)]; + left = leftptr + mad24(y-wsz2-1, cols, x - wsz2); + right = rightptr + mad24(y-wsz2-1, cols, x - wsz2 - d - mindisp); - int best_disp = FILTERED, best_cost = INT_MAX-1; - for(int i = 0; (i < ndisp); i++) + int costdiff = 0, textdiff = 0; + #pragma unroll + for(int i = 0; i < wsz; i++) { - best_cost = (cost[i + ndisp*(d-mindisp)] < best_cost) ? cost[i + ndisp*(d-mindisp)] : best_cost; - best_disp = (best_cost == cost[i + ndisp*(d-mindisp)]) ? i+mindisp : best_disp; + costdiff += + abs( left[wsz*cols] - right[wsz*cols] ); + textdiff += abs( left[wsz*cols] - preFilterCap ); + left++; right++; } + cost[0] += costdiff - costbuf[head]; + textsum[y-(gy*ndisp)] += textdiff - textbuf[head]; + costbuf[head] = costdiff; + textbuf[head] = textdiff; + head++; + } + barrier(CLK_LOCAL_MEM_FENCE); - int thresh = best_cost + (best_cost * uniquenessRatio/100); - for(int i = 0; (i < ndisp) && (uniquenessRatio > 0); i++) + cost = &costFunc[0] + d*ndisp; + short best_disp = FILTERED, best_cost = MAX_VAL-1; + #pragma unroll + for(int i = 0; i < tsize; i++) + { + short c = cost[0]; + best_cost = (c < best_cost) ? c : best_cost; + best_disp = (best_cost == c) ? ndisp - i - 1 : best_disp; + cost++; + } + + cost = &costFunc[0] + d*ndisp; + int thresh = best_cost + (best_cost * uniquenessRatio/100); + #pragma unroll + for(int i = 0; (i < tsize) && (uniquenessRatio > 0); i++) + { + best_disp = ( (cost[0] <= thresh) && (i < (ndisp - best_disp - 2) || i > (ndisp - best_disp) ) ) ? + FILTERED : best_disp; + cost++; + } + + best_disp = (total_y >= rows-wsz2) || (total_y < wsz2) || (textsum[d] < textureTreshold) ? FILTERED : best_disp; + + if( best_disp != FILTERED ) + { + cost = &costFunc[0] + (ndisp - best_disp - 1) + ndisp*d; + int y3 = ((ndisp - best_disp - 1) > 0) ? cost[-1] : cost[1], + y2 = cost[0], + y1 = ((ndisp - best_disp - 1) < ndisp-1) ? cost[1] : cost[-1]; + d = y3+y1-2*y2 + abs(y3-y1); + if( x < cols && total_y < rows) { - best_disp = ( (cost[i + ndisp*(d-mindisp)] <= thresh) && (i < best_disp - mindisp - 1 || i > best_disp - mindisp + 1) ) ? - FILTERED : best_disp; - } - - disp[0] = textsum[d-mindisp] < textureTreshold ? (FILTERED) : (best_disp == FILTERED) ? (short)(best_disp) : (short)(best_disp); - - if( best_disp != FILTERED ) - { - int y1 = (best_disp > mindisp) ? cost[(best_disp-mindisp-1) + ndisp*(d-mindisp)] : - cost[(best_disp-mindisp+1) + ndisp*(d-mindisp)], - y2 = cost[(best_disp-mindisp) + ndisp*(d-mindisp)], - y3 = (best_disp < mindisp+ndisp-1) ? cost[(best_disp-mindisp+1) + ndisp*(d-mindisp)] : - cost[(best_disp-mindisp-1) + ndisp*(d-mindisp)]; - float a = (y3 - ((best_disp+1)*(y2-y1) + best_disp*y1 - (best_disp-1)*y2)/(best_disp - (best_disp-1)) )/ - ((best_disp+1)*((best_disp+1) - (best_disp-1) - best_disp) + (best_disp-1)*best_disp); - float b = (y2 - y1)/(best_disp - (best_disp-1)) - a*((best_disp-1)+best_disp); - disp[0] = (y1 == y2 || y3 == y2) ? (short)(best_disp*16) :(short)(-b/(2*a)*16); + disp[0] = (short)(((ndisp - best_disp - 1 + mindisp)*256 + (d != 0 ? (y3-y1)*256/d : 0) + 15) >> 4); } } } @@ -148,7 +177,7 @@ __kernel void stereoBM_BF(__global const uchar * left, __global const uchar * ri int y = get_global_id(1); int wsz2 = winsize/2; short FILTERED = (mindisp - 1)<<4; - + if(x < cols && y < rows ) { int dispIdx = mad24(y, disp_step, disp_offset + x*(int)sizeof(short) ); @@ -161,21 +190,20 @@ __kernel void stereoBM_BF(__global const uchar * left, __global const uchar * ri for(int d = mindisp; d < ndisp+mindisp; d++) { - cost[d-mindisp] = 0; + cost[(ndisp-1) - (d - mindisp)] = 0; for(int i = -wsz2; i < wsz2+1; i++) for(int j = -wsz2; j < wsz2+1; j++) { - textsum += abs( left[min( y+i, rows-1 ) * cols + min( x+j, cols-1 )] - preFilterCap ); - cost[d-mindisp] += abs( left[min( y+i, rows-1 ) * cols + min( x+j, cols-1 )] - - right[min( y+i, rows-1 ) * cols + min( x+j-d, cols-1 )] ); + textsum += (d == mindisp) ? abs( left[ (y+i) * cols + x + j] - preFilterCap ) : 0; + cost[(ndisp-1) - (d - mindisp)] += abs(left[(y+i) * cols + x+j] - right[(y+i) * cols + x+j-d] ); } } - int best_disp = mindisp, best_cost = cost[0]; - for(int d = mindisp; d < ndisp+mindisp; d++) + int best_disp = -1, best_cost = INT_MAX; + for(int d = ndisp + mindisp - 1; d > mindisp-1; d--) { best_cost = (cost[d-mindisp] < best_cost) ? cost[d-mindisp] : best_cost; - best_disp = (best_cost == cost[d-mindisp]) ? d : best_disp; + best_disp = (best_cost == cost[d-mindisp]) ? (d) : best_disp; } int thresh = best_cost + (best_cost * uniquenessRatio/100); @@ -191,10 +219,8 @@ __kernel void stereoBM_BF(__global const uchar * left, __global const uchar * ri int y1 = (best_disp > mindisp) ? cost[best_disp-mindisp-1] : cost[best_disp-mindisp+1], y2 = cost[best_disp-mindisp], y3 = (best_disp < mindisp+ndisp-1) ? cost[best_disp-mindisp+1] : cost[best_disp-mindisp-1]; - float a = (y3 - ((best_disp+1)*(y2-y1) + best_disp*y1 - (best_disp-1)*y2)/(best_disp - (best_disp-1)) )/ - ((best_disp+1)*((best_disp+1) - (best_disp-1) - best_disp) + (best_disp-1)*best_disp); - float b = (y2 - y1)/(best_disp - (best_disp-1)) - a*((best_disp-1)+best_disp); - disp[0] = (y1 == y2 || y2 == y3) ? (short)(best_disp*16) : (short)(-b/(2*a)*16); + int _d = y3+y1-2*y2 + abs(y3-y1); + disp[0] = (short)(((ndisp - (best_disp-mindisp) - 1 + mindisp)*256 + (_d != 0 ? (y3-y1)*256/_d : 0) + 15) >> 4); } } } @@ -221,10 +247,10 @@ __kernel void prefilter_norm(__global unsigned char *input, __global unsigned ch int cov2 = 0; for(int i = -wsz2; i < wsz2+1; i++) for(int j = -wsz2; j < wsz2+1; j++) - cov2 += input[min( max( (y+i),0 ),rows-1 ) * cols + min( max( (x+j),0 ),cols-1 )]; + cov2 += input[clamp(y+i, 0, rows-1) * cols + clamp(x+j, 0, cols-1)]; int res = (cov1*scale_g - cov2*scale_s)>>10; - res = min(min(max(-prefilterCap, res), prefilterCap) + prefilterCap, 255); + res = min(clamp(res, -prefilterCap, prefilterCap) + prefilterCap, 255); output[y * cols + x] = res & 0xFF; } } @@ -240,13 +266,13 @@ __kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned int x = get_global_id(0); int y = get_global_id(1); output[y * cols + x] = min(prefilterCap, 255) & 0xFF; - if(x < cols && y < rows-1 && x > 0) + if(x < cols && y < rows && x > 0 && !((y == rows-1)&(rows%2==1) ) ) { - int cov = input[((y > 0) ? y-1 : y+1) * cols + (x-1)] * (-1) + input[((y > 0) ? y-1 : y+1) * cols + ((x 0) ? y-1 : y+1) * cols + (x-1)] * (-1) + input[ ((y > 0) ? y-1 : y+1) * cols + ((xnumDisparities; - ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D tsize=%d", ndisp*ndisp, ndisp) ); + ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D tsize=%d -D wsz=%d", ndisp*ndisp, ndisp, state->SADWindowSize) ); if(k.empty()) return false; @@ -748,8 +747,9 @@ static bool ocl_stereobm_opt( InputArray _left, InputArray _right, _disp.create(_left.size(), CV_16S); UMat disp = _disp.getUMat(); - size_t globalThreads[3] = { left.cols, left.rows, 1 }; - size_t localThreads[3] = {ndisp, 1, 1}; + int nthreads = (ndisp <= 64) ? 2 : 4; + size_t globalThreads[3] = { left.cols, (left.rows - left.rows%ndisp + ndisp), nthreads}; + size_t localThreads[3] = {1, ndisp, nthreads}; int idx = 0; idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left)); @@ -758,11 +758,11 @@ static bool ocl_stereobm_opt( InputArray _left, InputArray _right, idx = k.set(idx, state->minDisparity); idx = k.set(idx, ndisp); idx = k.set(idx, state->preFilterCap); - idx = k.set(idx, state->SADWindowSize); + idx = k.set(idx, nthreads); idx = k.set(idx, state->textureThreshold); idx = k.set(idx, state->uniquenessRatio); - return k.run(2, globalThreads, localThreads, false); + return k.run(3, globalThreads, localThreads, false); } static bool ocl_stereobm_bf(InputArray _left, InputArray _right, @@ -790,15 +790,16 @@ static bool ocl_stereobm_bf(InputArray _left, InputArray _right, idx = k.set(idx, state->uniquenessRatio); return k.run(2, globalThreads, NULL, false); + return false; } static bool ocl_stereo(InputArray _left, InputArray _right, OutputArray _disp, StereoBMParams* state) { - if(ocl::Device::getDefault().localMemSize() > state->numDisparities * state->numDisparities * sizeof(int) ) + if(ocl::Device::getDefault().localMemSize() > state->numDisparities * state->numDisparities * sizeof(short) ) return ocl_stereobm_opt(_left, _right, _disp, state); else - return ocl_stereobm_bf(_left, _right, _disp, state); + return false;//ocl_stereobm_bf(_left, _right, _disp, state); } struct FindStereoCorrespInvoker : public ParallelLoopBody @@ -992,7 +993,7 @@ public: bufSize2 = width*height*(sizeof(Point_) + sizeof(int) + sizeof(uchar)); #if CV_SSE2 - bool useShorts = false;//params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2); + bool useShorts = params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2); #else const bool useShorts = false; #endif diff --git a/modules/calib3d/test/opencl/test_stereobm.cpp b/modules/calib3d/test/opencl/test_stereobm.cpp index 15fa93aa4..a683e6938 100644 --- a/modules/calib3d/test/opencl/test_stereobm.cpp +++ b/modules/calib3d/test/opencl/test_stereobm.cpp @@ -81,11 +81,27 @@ OCL_TEST_P(StereoBMFixture, StereoBM) { Ptr bm = createStereoBM( n_disp, winSize); bm->setPreFilterType(bm->PREFILTER_XSOBEL); +// bm->setMinDisparity(15); + long t1 = clock(); OCL_OFF(bm->compute(left, right, disp)); + long t2 = clock(); OCL_ON(bm->compute(uleft, uright, udisp)); - - Near(0.05); + cv::ocl::finish(); + long t3 = clock(); + std::cout << (double)(t2-t1)/CLOCKS_PER_SEC << " " << (double)(t3-t2)/CLOCKS_PER_SEC << std::endl; +/* + Mat t; absdiff(disp, udisp, t); +/* for(int i = 0; i(i,j) > 0) + if(i>=5 && i <=16 && j == 36+15) + printf("%d %d cv: %d ocl: %d\n", i, j, disp.at(i,j), udisp.getMat(ACCESS_READ).at(i,j) );*/ +/* imshow("diff.png", t*100); + imshow("cv.png", disp*100); + imshow("ocl.png", udisp.getMat(ACCESS_READ)*100); + waitKey(0);*/ + Near(1e-3); } OCL_INSTANTIATE_TEST_CASE_P(StereoMatcher, StereoBMFixture, testing::Combine(testing::Values(32, 64, 128), From 799d7e7a5093eef28fd37d8e0f86fa55e8814aa3 Mon Sep 17 00:00:00 2001 From: Konstantin Matskevich Date: Fri, 28 Feb 2014 09:28:07 +0400 Subject: [PATCH 04/10] bad experiment =( --- modules/calib3d/src/opencl/stereobm.cl | 153 +++++++++++------- modules/calib3d/src/stereobm.cpp | 12 +- modules/calib3d/test/opencl/test_stereobm.cpp | 6 +- 3 files changed, 99 insertions(+), 72 deletions(-) diff --git a/modules/calib3d/src/opencl/stereobm.cl b/modules/calib3d/src/opencl/stereobm.cl index 2e74f591c..63765357f 100644 --- a/modules/calib3d/src/opencl/stereobm.cl +++ b/modules/calib3d/src/opencl/stereobm.cl @@ -49,38 +49,85 @@ #define MAX_VAL 32767 -__kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar * rightptr, __global uchar * dispptr, - int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp, - int preFilterCap, int nthreads, int textureTreshold, int uniquenessRatio) +void calcDisp(__local short * costFunc, __global short * disp, int uniquenessRatio, int textureTreshold, short textsum, int mindisp, int ndisp) { - int x = get_global_id(0); - int total_y = get_global_id(1); - int z = get_local_id(2); - int d = get_local_id(1); - int gy = get_group_id(1), y = gy*ndisp + z*ndisp/nthreads; - int wsz2 = wsz/2; short FILTERED = (mindisp - 1)<<4; - __local short costFunc[csize]; - short textsum[tsize]; - __local short * cost = &costFunc[0] + d + ndisp*ndisp/nthreads*z; - __global uchar * left, * right; - int dispIdx = mad24(total_y, disp_step, disp_offset + x*(int)sizeof(short) ); - __global short * disp = (__global short*)(dispptr + dispIdx); - if( x < cols && total_y < rows) + short best_disp = FILTERED, best_cost = MAX_VAL-1; + __local short * cost; + cost = &costFunc[0]; + #pragma unroll + for(int i = 0; i < tsize/2; i++) { - disp[0] = FILTERED; + short c = cost[0]; + best_cost = (c < best_cost) ? c : best_cost; + best_disp = (best_cost == c) ? ndisp - i - 1 : best_disp; + cost++; } + cost = &costFunc[0]; + int thresh = best_cost + (best_cost * uniquenessRatio/100); + #pragma unroll + for(int i = 0; (i < tsize/2) && (uniquenessRatio > 0); i++) + { + best_disp = ( (cost[0] <= thresh) && (i < (ndisp - best_disp - 2) || i > (ndisp - best_disp) ) ) ? + FILTERED : best_disp; + cost++; + } + + best_disp = (textsum < textureTreshold) ? FILTERED : best_disp; + + if( best_disp != FILTERED ) + { + cost = &costFunc[0] + (ndisp - best_disp - 1); + int y3 = ((ndisp - best_disp - 1) > 0) ? cost[-1] : cost[1], + y2 = cost[0], + y1 = ((ndisp - best_disp - 1) < ndisp-1) ? cost[1] : cost[-1]; + int d = y3+y1-2*y2 + abs(y3-y1); + disp[0] = (short)best_disp;//(((ndisp - best_disp - 1 + mindisp)*256 + (d != 0 ? (y3-y1)*256/d : 0) + 15) >> 4); + } +} + +__kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar * rightptr, __global uchar * dispptr, + int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp, + int preFilterCap, int textureTreshold, int uniquenessRatio) +{ + int x = get_global_id(0); + int ly = get_local_id(1); + int y = get_global_id(1)*32; + int d = get_local_id(2); + int wsz2 = wsz/2; + short FILTERED = (mindisp - 1)<<4; + __local short costFunc[tsize]; + __local short bestdisp[tsize]; + short textsum; + __local short * cost = &costFunc[0] + d +ly*ndisp; + __global uchar * left, * right; + int dispIdx = mad24(y, disp_step, disp_offset + x*(int)sizeof(short) ); + __global short * disp = (__global short*)(dispptr + dispIdx); + + short best_cost = MAX_VAL-1, best_disp = FILTERED; short costbuf[wsz]; short textbuf[wsz]; int head = 0; + int endy = y+32; + + cost[0] = 0; + bestdisp[d + ly*ndisp] = d; + textsum = 0; + + for(; y < wsz2; y++) + { + disp[0] = FILTERED; + disp += cols; + } + if( x < cols && y < rows) + { + disp[0] = FILTERED; + } + if( (x > ndisp+mindisp+wsz2-2) && (x < cols - wsz2 - mindisp) ) { - cost += (y < wsz2) ? ndisp*wsz2 : 0; - y = (y= wsz2) ) + { + calcDisp(&costFunc[ly*ndisp], &disp[0], uniquenessRatio, textureTreshold, textsum, mindisp, ndisp); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + y++; + cost = &costFunc[0] + d+ly*ndisp; + + for(; (y < endy) && (y wsz2 && (x > ndisp+mindisp+wsz2-2) && (x < cols - wsz2 - mindisp) ) { head = head%wsz; - cost += ndisp; - cost[0] = cost[-ndisp]; - textsum[y-(gy*ndisp)] = textsum[(y-1)-(gy*ndisp)]; left = leftptr + mad24(y-wsz2-1, cols, x - wsz2); right = rightptr + mad24(y-wsz2-1, cols, x - wsz2 - d - mindisp); @@ -120,47 +179,17 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar left++; right++; } cost[0] += costdiff - costbuf[head]; - textsum[y-(gy*ndisp)] += textdiff - textbuf[head]; + textsum += textdiff - textbuf[head]; costbuf[head] = costdiff; textbuf[head] = textdiff; head++; - } - barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); - cost = &costFunc[0] + d*ndisp; - short best_disp = FILTERED, best_cost = MAX_VAL-1; - #pragma unroll - for(int i = 0; i < tsize; i++) - { - short c = cost[0]; - best_cost = (c < best_cost) ? c : best_cost; - best_disp = (best_cost == c) ? ndisp - i - 1 : best_disp; - cost++; - } - - cost = &costFunc[0] + d*ndisp; - int thresh = best_cost + (best_cost * uniquenessRatio/100); - #pragma unroll - for(int i = 0; (i < tsize) && (uniquenessRatio > 0); i++) - { - best_disp = ( (cost[0] <= thresh) && (i < (ndisp - best_disp - 2) || i > (ndisp - best_disp) ) ) ? - FILTERED : best_disp; - cost++; - } - - best_disp = (total_y >= rows-wsz2) || (total_y < wsz2) || (textsum[d] < textureTreshold) ? FILTERED : best_disp; - - if( best_disp != FILTERED ) - { - cost = &costFunc[0] + (ndisp - best_disp - 1) + ndisp*d; - int y3 = ((ndisp - best_disp - 1) > 0) ? cost[-1] : cost[1], - y2 = cost[0], - y1 = ((ndisp - best_disp - 1) < ndisp-1) ? cost[1] : cost[-1]; - d = y3+y1-2*y2 + abs(y3-y1); - if( x < cols && total_y < rows) + if(d == 0) { - disp[0] = (short)(((ndisp - best_disp - 1 + mindisp)*256 + (d != 0 ? (y3-y1)*256/d : 0) + 15) >> 4); + calcDisp(&costFunc[ly*ndisp], &disp[0], uniquenessRatio, textureTreshold, textsum, mindisp, ndisp); } + barrier(CLK_LOCAL_MEM_FENCE); } } } diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp index 876215b53..67864b07d 100644 --- a/modules/calib3d/src/stereobm.cpp +++ b/modules/calib3d/src/stereobm.cpp @@ -681,7 +681,7 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right, sad[ndisp] = sad[ndisp-2]; int p = sad[mind+1], n = sad[mind-1]; d = p + n - 2*sad[mind] + std::abs(p - n); - dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4); + dptr[y*dstep] = (short)mind;//(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4); costptr[y*coststep] = sad[mind]; } } @@ -739,7 +739,7 @@ static bool ocl_stereobm_opt( InputArray _left, InputArray _right, OutputArray _disp, StereoBMParams* state) {//printf("opt\n"); int ndisp = state->numDisparities; - ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D tsize=%d -D wsz=%d", ndisp*ndisp, ndisp, state->SADWindowSize) ); + ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D tsize=%d -D wsz=%d", ndisp*ndisp, 2*ndisp, state->SADWindowSize) ); if(k.empty()) return false; @@ -747,9 +747,8 @@ static bool ocl_stereobm_opt( InputArray _left, InputArray _right, _disp.create(_left.size(), CV_16S); UMat disp = _disp.getUMat(); - int nthreads = (ndisp <= 64) ? 2 : 4; - size_t globalThreads[3] = { left.cols, (left.rows - left.rows%ndisp + ndisp), nthreads}; - size_t localThreads[3] = {1, ndisp, nthreads}; + size_t globalThreads[3] = { left.cols, (left.rows-left.rows%32 + 32)/32, ndisp}; + size_t localThreads[3] = {1, 2, ndisp}; int idx = 0; idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left)); @@ -758,7 +757,6 @@ static bool ocl_stereobm_opt( InputArray _left, InputArray _right, idx = k.set(idx, state->minDisparity); idx = k.set(idx, ndisp); idx = k.set(idx, state->preFilterCap); - idx = k.set(idx, nthreads); idx = k.set(idx, state->textureThreshold); idx = k.set(idx, state->uniquenessRatio); @@ -993,7 +991,7 @@ public: bufSize2 = width*height*(sizeof(Point_) + sizeof(int) + sizeof(uchar)); #if CV_SSE2 - bool useShorts = params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2); + bool useShorts = false;//params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2); #else const bool useShorts = false; #endif diff --git a/modules/calib3d/test/opencl/test_stereobm.cpp b/modules/calib3d/test/opencl/test_stereobm.cpp index a683e6938..1852e0dfd 100644 --- a/modules/calib3d/test/opencl/test_stereobm.cpp +++ b/modules/calib3d/test/opencl/test_stereobm.cpp @@ -90,12 +90,12 @@ OCL_TEST_P(StereoBMFixture, StereoBM) cv::ocl::finish(); long t3 = clock(); std::cout << (double)(t2-t1)/CLOCKS_PER_SEC << " " << (double)(t3-t2)/CLOCKS_PER_SEC << std::endl; -/* + Mat t; absdiff(disp, udisp, t); /* for(int i = 0; i(i,j) > 0) - if(i>=5 && i <=16 && j == 36+15) + if(t.at(i,j) > 0) + // if(i == 125 && j == 174) printf("%d %d cv: %d ocl: %d\n", i, j, disp.at(i,j), udisp.getMat(ACCESS_READ).at(i,j) );*/ /* imshow("diff.png", t*100); imshow("cv.png", disp*100); From a5d989f346098254a66d6d582cedce17915cde00 Mon Sep 17 00:00:00 2001 From: Konstantin Matskevich Date: Tue, 4 Mar 2014 10:25:07 +0400 Subject: [PATCH 05/10] new attempt --- modules/calib3d/src/opencl/stereobm.cl | 125 +++++++----------- modules/calib3d/src/stereobm.cpp | 9 +- modules/calib3d/test/opencl/test_stereobm.cpp | 8 +- 3 files changed, 56 insertions(+), 86 deletions(-) diff --git a/modules/calib3d/src/opencl/stereobm.cl b/modules/calib3d/src/opencl/stereobm.cl index 63765357f..bca6dd48b 100644 --- a/modules/calib3d/src/opencl/stereobm.cl +++ b/modules/calib3d/src/opencl/stereobm.cl @@ -56,7 +56,7 @@ void calcDisp(__local short * costFunc, __global short * disp, int uniquenessRat __local short * cost; cost = &costFunc[0]; #pragma unroll - for(int i = 0; i < tsize/2; i++) + for(int i = 0; i < tsize; i++) { short c = cost[0]; best_cost = (c < best_cost) ? c : best_cost; @@ -67,14 +67,14 @@ void calcDisp(__local short * costFunc, __global short * disp, int uniquenessRat cost = &costFunc[0]; int thresh = best_cost + (best_cost * uniquenessRatio/100); #pragma unroll - for(int i = 0; (i < tsize/2) && (uniquenessRatio > 0); i++) + for(int i = 0; (i < tsize) && (uniquenessRatio > 0); i++) { best_disp = ( (cost[0] <= thresh) && (i < (ndisp - best_disp - 2) || i > (ndisp - best_disp) ) ) ? FILTERED : best_disp; cost++; } - best_disp = (textsum < textureTreshold) ? FILTERED : best_disp; +// best_disp = (textsum < textureTreshold) ? FILTERED : best_disp; if( best_disp != FILTERED ) { @@ -92,104 +92,73 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar int preFilterCap, int textureTreshold, int uniquenessRatio) { int x = get_global_id(0); - int ly = get_local_id(1); - int y = get_global_id(1)*32; + int total_y = get_global_id(1); int d = get_local_id(2); + int ly = get_local_id(1); + int gy = get_group_id(1), y = gy*wsz; int wsz2 = wsz/2; short FILTERED = (mindisp - 1)<<4; - __local short costFunc[tsize]; - __local short bestdisp[tsize]; + __local short costFunc[csize]; short textsum; - __local short * cost = &costFunc[0] + d +ly*ndisp; - __global uchar * left, * right; - int dispIdx = mad24(y, disp_step, disp_offset + x*(int)sizeof(short) ); + __local short * cost = costFunc + d; + + __global const uchar * left, * right; + int dispIdx = mad24(total_y, disp_step, disp_offset + x*(int)sizeof(short) ); __global short * disp = (__global short*)(dispptr + dispIdx); short best_cost = MAX_VAL-1, best_disp = FILTERED; short costbuf[wsz]; - short textbuf[wsz]; - int head = 0; - int endy = y+32; - - cost[0] = 0; - bestdisp[d + ly*ndisp] = d; - textsum = 0; - - for(; y < wsz2; y++) - { - disp[0] = FILTERED; - disp += cols; - } - if( x < cols && y < rows) + if( x < cols && total_y < rows) { disp[0] = FILTERED; } if( (x > ndisp+mindisp+wsz2-2) && (x < cols - wsz2 - mindisp) ) { - for(int i = -wsz2; (i < wsz2+1) && (y < rows-wsz2); i++) - { - left = leftptr + mad24(y+i, cols, x-wsz2); - right = rightptr + mad24(y+i, cols, x-wsz2-d-mindisp); + cost[ly*ndisp] = 0; + cost += (y < wsz2) ? ndisp*wsz2 : 0; + y = (y= wsz2) ) + y++; + for(; (y < gy*wsz + wsz) && (y < rows-wsz2); y++) { - calcDisp(&costFunc[ly*ndisp], &disp[0], uniquenessRatio, textureTreshold, textsum, mindisp, ndisp); + cost += ndisp; + left += cols; + right += cols; + costdiff += abs(left[0] - right[0]) - abs(left[(-wsz2-1)*cols] - right[(-wsz2-1)*cols]);//costbuf[(y-1)%wsz]; + for( int i = 0; i < wsz; i++) + { + if(ly == i) + cost[0] += costdiff; + } + barrier(CLK_LOCAL_MEM_FENCE); } barrier(CLK_LOCAL_MEM_FENCE); - } - - y++; - cost = &costFunc[0] + d+ly*ndisp; - - for(; (y < endy) && (y wsz2 && (x > ndisp+mindisp+wsz2-2) && (x < cols - wsz2 - mindisp) ) +/* + if(total_y >= wsz2 && total_y < rows - wsz2 && d == 0) { - head = head%wsz; - left = leftptr + mad24(y-wsz2-1, cols, x - wsz2); - right = rightptr + mad24(y-wsz2-1, cols, x - wsz2 - d - mindisp); + cost = costFunc + ly*ndisp; + disp[0] = cost[wsz-1]; + }*/ - int costdiff = 0, textdiff = 0; - #pragma unroll - for(int i = 0; i < wsz; i++) - { - costdiff += - abs( left[wsz*cols] - right[wsz*cols] ); - textdiff += abs( left[wsz*cols] - preFilterCap ); - left++; right++; - } - cost[0] += costdiff - costbuf[head]; - textsum += textdiff - textbuf[head]; - costbuf[head] = costdiff; - textbuf[head] = textdiff; - head++; - barrier(CLK_LOCAL_MEM_FENCE); - - if(d == 0) - { - calcDisp(&costFunc[ly*ndisp], &disp[0], uniquenessRatio, textureTreshold, textsum, mindisp, ndisp); - } - barrier(CLK_LOCAL_MEM_FENCE); + if(total_y >= wsz2 && total_y < rows - wsz2 && d == 0) + { + calcDisp(&(costFunc + ly*ndisp)[0], disp, uniquenessRatio, textureTreshold, textsum, mindisp, ndisp); } } } diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp index 67864b07d..972499e27 100644 --- a/modules/calib3d/src/stereobm.cpp +++ b/modules/calib3d/src/stereobm.cpp @@ -739,7 +739,8 @@ static bool ocl_stereobm_opt( InputArray _left, InputArray _right, OutputArray _disp, StereoBMParams* state) {//printf("opt\n"); int ndisp = state->numDisparities; - ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D tsize=%d -D wsz=%d", ndisp*ndisp, 2*ndisp, state->SADWindowSize) ); + int wsz = state->SADWindowSize; + ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D tsize=%d -D wsz=%d", wsz*ndisp, ndisp, wsz) ); if(k.empty()) return false; @@ -747,8 +748,8 @@ static bool ocl_stereobm_opt( InputArray _left, InputArray _right, _disp.create(_left.size(), CV_16S); UMat disp = _disp.getUMat(); - size_t globalThreads[3] = { left.cols, (left.rows-left.rows%32 + 32)/32, ndisp}; - size_t localThreads[3] = {1, 2, ndisp}; + size_t globalThreads[3] = { left.cols, (left.rows-left.rows%wsz + wsz), ndisp}; + size_t localThreads[3] = {1, wsz, ndisp}; int idx = 0; idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left)); @@ -797,7 +798,7 @@ static bool ocl_stereo(InputArray _left, InputArray _right, if(ocl::Device::getDefault().localMemSize() > state->numDisparities * state->numDisparities * sizeof(short) ) return ocl_stereobm_opt(_left, _right, _disp, state); else - return false;//ocl_stereobm_bf(_left, _right, _disp, state); + return ocl_stereobm_bf(_left, _right, _disp, state); } struct FindStereoCorrespInvoker : public ParallelLoopBody diff --git a/modules/calib3d/test/opencl/test_stereobm.cpp b/modules/calib3d/test/opencl/test_stereobm.cpp index 1852e0dfd..b6f777653 100644 --- a/modules/calib3d/test/opencl/test_stereobm.cpp +++ b/modules/calib3d/test/opencl/test_stereobm.cpp @@ -92,11 +92,11 @@ OCL_TEST_P(StereoBMFixture, StereoBM) std::cout << (double)(t2-t1)/CLOCKS_PER_SEC << " " << (double)(t3-t2)/CLOCKS_PER_SEC << std::endl; Mat t; absdiff(disp, udisp, t); -/* for(int i = 0; i(i,j) > 0) - // if(i == 125 && j == 174) - printf("%d %d cv: %d ocl: %d\n", i, j, disp.at(i,j), udisp.getMat(ACCESS_READ).at(i,j) );*/ + // if(t.at(i,j) > 0) + if(i == 5 && j == 38) + printf("%d %d cv: %d ocl: %d\n", i, j, disp.at(i,j), udisp.getMat(ACCESS_READ).at(i,j) ); /* imshow("diff.png", t*100); imshow("cv.png", disp*100); imshow("ocl.png", udisp.getMat(ACCESS_READ)*100); From ddc235172edcd76777f80825828ababc8ccd942c Mon Sep 17 00:00:00 2001 From: Konstantin Matskevich Date: Thu, 6 Mar 2014 17:56:42 +0400 Subject: [PATCH 06/10] new approach --- modules/calib3d/perf/opencl/perf_stereobm.cpp | 2 +- modules/calib3d/src/opencl/stereobm.cl | 249 ++++++++++++------ modules/calib3d/src/stereobm.cpp | 41 ++- modules/calib3d/test/opencl/test_stereobm.cpp | 8 +- 4 files changed, 200 insertions(+), 100 deletions(-) diff --git a/modules/calib3d/perf/opencl/perf_stereobm.cpp b/modules/calib3d/perf/opencl/perf_stereobm.cpp index b795a3526..936845a4b 100644 --- a/modules/calib3d/perf/opencl/perf_stereobm.cpp +++ b/modules/calib3d/perf/opencl/perf_stereobm.cpp @@ -68,7 +68,7 @@ OCL_PERF_TEST_P(StereoBMFixture, StereoBM, ::testing::Combine(OCL_PERF_ENUM(32, OCL_TEST_CYCLE() bm->compute(left, right, disp); - SANITY_CHECK(disp, 1e-3, ERROR_RELATIVE); + SANITY_CHECK_NOTHING();//(disp, 1e-3, ERROR_RELATIVE); } }//ocl diff --git a/modules/calib3d/src/opencl/stereobm.cl b/modules/calib3d/src/opencl/stereobm.cl index bca6dd48b..7036cdcf3 100644 --- a/modules/calib3d/src/opencl/stereobm.cl +++ b/modules/calib3d/src/opencl/stereobm.cl @@ -40,6 +40,7 @@ // //M*/ +#pragma OPENCL EXTENSION cl_amd_printf : enable ////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////// stereoBM ////////////////////////////////////////////// @@ -49,117 +50,196 @@ #define MAX_VAL 32767 -void calcDisp(__local short * costFunc, __global short * disp, int uniquenessRatio, int textureTreshold, short textsum, int mindisp, int ndisp) +void calcDisp(__local short * costFunc, __global short * disp, int uniquenessRatio/*, int textureTreshold, short textsum*/, + int mindisp, int ndisp, int w, __local short * dispbuf, int d) { short FILTERED = (mindisp - 1)<<4; short best_disp = FILTERED, best_cost = MAX_VAL-1; __local short * cost; - cost = &costFunc[0]; - #pragma unroll - for(int i = 0; i < tsize; i++) - { - short c = cost[0]; - best_cost = (c < best_cost) ? c : best_cost; - best_disp = (best_cost == c) ? ndisp - i - 1 : best_disp; - cost++; - } cost = &costFunc[0]; - int thresh = best_cost + (best_cost * uniquenessRatio/100); - #pragma unroll - for(int i = 0; (i < tsize) && (uniquenessRatio > 0); i++) + dispbuf[d] = d; + barrier(CLK_LOCAL_MEM_FENCE); + + for(int lsize = tsize/2 >> 1; lsize > 0; lsize >>= 1) { - best_disp = ( (cost[0] <= thresh) && (i < (ndisp - best_disp - 2) || i > (ndisp - best_disp) ) ) ? - FILTERED : best_disp; - cost++; + short lid1 = dispbuf[d], lid2 = dispbuf[d+lsize], + cost1 = cost[lid1*w], cost2 = cost[lid2*w]; + if (d < lsize) + { + dispbuf[d] = (cost1 < cost2) ? lid1 : (cost1==cost2) ? max(lid1, lid2) : lid2; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + best_disp = ndisp - dispbuf[0] - 1; + best_cost = costFunc[(ndisp-best_disp-1)*w]; + + int thresh = best_cost + (best_cost * uniquenessRatio/100); + dispbuf[d] = ( (cost[d*w] <= thresh) && (d < (ndisp - best_disp - 2) || d > (ndisp - best_disp) ) ) ? FILTERED : best_disp; + barrier(CLK_LOCAL_MEM_FENCE); + + for(int lsize = tsize/2 >> 1; lsize > 0; lsize >>= 1) + { + short val1 = dispbuf[d], val2 = dispbuf[d+lsize]; + if (d < lsize) + { + dispbuf[d] = min(val1, val2); + } + barrier(CLK_LOCAL_MEM_FENCE); } // best_disp = (textsum < textureTreshold) ? FILTERED : best_disp; - if( best_disp != FILTERED ) + if( dispbuf[0] != FILTERED ) { - cost = &costFunc[0] + (ndisp - best_disp - 1); - int y3 = ((ndisp - best_disp - 1) > 0) ? cost[-1] : cost[1], + cost = &costFunc[0] + (ndisp - best_disp - 1)*w; + int y3 = ((ndisp - best_disp - 1) > 0) ? cost[-w] : cost[w], y2 = cost[0], - y1 = ((ndisp - best_disp - 1) < ndisp-1) ? cost[1] : cost[-1]; + y1 = ((ndisp - best_disp - 1) < ndisp-1) ? cost[w] : cost[-w]; int d = y3+y1-2*y2 + abs(y3-y1); - disp[0] = (short)best_disp;//(((ndisp - best_disp - 1 + mindisp)*256 + (d != 0 ? (y3-y1)*256/d : 0) + 15) >> 4); + disp[0] = (short)(((ndisp - best_disp - 1 + mindisp)*256 + (d != 0 ? (y3-y1)*256/d : 0) + 15) >> 4); } } +int calcLocalIdx(int x, int y, int d, int w) +{ + return d*2*w + (w - 1 - y + x); +} + +void calcNewCoordinates(int * x, int * y, int nthread) +{ + int oldX = *x - (1-nthread), oldY = *y; + *x = (oldX == oldY) ? (0*nthread + (oldX + 2)*(1-nthread) ) : (oldX+1)*(1-nthread) + (oldX+1)*nthread; + *y = (oldX == oldY) ? (0*(1-nthread) + (oldY + 1)*nthread) : oldY + 1*(1-nthread); +} + +short calcCostBorder(__global const uchar * leftptr, __global const uchar * rightptr, int x, int y, int nthread, + int wsz2, short * costbuf, int * h, int cols, int d, short cost) +{ + int head = (*h)%wsz; + __global const uchar * left, * right; + int idx = mad24(y+wsz2*(2*nthread-1), cols, x+wsz2*(1-2*nthread)); + left = leftptr + idx; + right = rightptr + (idx - d); + + short costdiff = 0; + for(int i = 0; i < wsz; i++) + { + costdiff += abs( left[0] - right[0] ); + left += 1*nthread + cols*(1-nthread); + right += 1*nthread + cols*(1-nthread);// maybe use ? operator + } + cost += costdiff - costbuf[head]; + costbuf[head] = costdiff; + (*h) = (*h)%wsz + 1; + return cost; +} + +short calcCostInside(__global const uchar * leftptr, __global const uchar * rightptr, int x, int y, + int wsz2, int cols, int d, short cost_up_left, short cost_up, short cost_left) +{ + __global const uchar * left, * right; + int idx = mad24(y-wsz2-1, cols, x-wsz2-1); + left = leftptr + idx; + right = rightptr + (idx - d); + + return cost_up + cost_left - cost_up_left + abs(left[0] - right[0]) - + abs(left[wsz] - right[wsz]) - abs(left[(wsz)*cols] - right[(wsz)*cols]) + + abs(left[(wsz)*cols + wsz] - right[(wsz)*cols + wsz]); +} + __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar * rightptr, __global uchar * dispptr, int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp, - int preFilterCap, int textureTreshold, int uniquenessRatio) + int preFilterCap, int textureTreshold, int uniquenessRatio, int sizeX, int sizeY) { - int x = get_global_id(0); - int total_y = get_global_id(1); - int d = get_local_id(2); - int ly = get_local_id(1); - int gy = get_group_id(1), y = gy*wsz; + int gx = get_global_id(0)*sizeX; + int gy = get_global_id(1)*sizeY; + int lz = get_local_id(2); + + int nthread = lz/32;// only 0 or 1 + int d = lz%32;// 1 .. 32 int wsz2 = wsz/2; - short FILTERED = (mindisp - 1)<<4; - __local short costFunc[csize]; - short textsum; - __local short * cost = costFunc + d; + __global short * disp; __global const uchar * left, * right; - int dispIdx = mad24(total_y, disp_step, disp_offset + x*(int)sizeof(short) ); - __global short * disp = (__global short*)(dispptr + dispIdx); - short best_cost = MAX_VAL-1, best_disp = FILTERED; + __local short dispbuf[tsize]; + __local short costFunc[csize]; + __local short * cost; + short costbuf[wsz]; + int head = 0; - if( x < cols && total_y < rows) + int shiftX = wsz2 + ndisp + mindisp - 1; + int shiftY = wsz2; + + int x = gx + shiftX, y = gy + shiftY, lx = 0, ly = 0; + + int costIdx = calcLocalIdx(lx, ly, d, sizeY); + cost = costFunc + costIdx; + + short tempcost = 0; + for(int i = 0; i < wsz; i++) { - disp[0] = FILTERED; - } - - if( (x > ndisp+mindisp+wsz2-2) && (x < cols - wsz2 - mindisp) ) - { - cost[ly*ndisp] = 0; - cost += (y < wsz2) ? ndisp*wsz2 : 0; - y = (y= wsz2 && total_y < rows - wsz2 && d == 0) - { - cost = costFunc + ly*ndisp; - disp[0] = cost[wsz-1]; - }*/ - if(total_y >= wsz2 && total_y < rows - wsz2 && d == 0) + for(int j = 0; j < wsz; j++) { - calcDisp(&(costFunc + ly*ndisp)[0], disp, uniquenessRatio, textureTreshold, textsum, mindisp, ndisp); + costdiff += abs( left[0] - right[0] ); + left += 1*nthread + cols*(1-nthread); + right += 1*nthread + cols*(1-nthread);// maybe use ? operator } + if(nthread==1) + { + tempcost += costdiff; + } + costbuf[head] = costdiff; + head++; + } + barrier(CLK_LOCAL_MEM_FENCE); + cost[0] = tempcost; + + if(x < cols-wsz2-mindisp && y < rows-wsz2 && nthread == 1) + { + int dispIdx = mad24(gy, disp_step, disp_offset + gx*(int)sizeof(short)); + disp = (__global short *)(dispptr + dispIdx); + calcDisp(&costFunc[sizeY - 1 + lx - ly], disp, uniquenessRatio, /*textureTreshold, textsum,*/ + mindisp, ndisp, 2*sizeY, &dispbuf[nthread*tsize/2], d); + } + barrier(CLK_LOCAL_MEM_FENCE); + + lx = 1 - nthread; + ly = nthread; + + while(lx < sizeX && ly < sizeY ) + { + x = gx + shiftX + lx; + y = gy + shiftY + ly; + + costIdx = calcLocalIdx(lx, ly, d, sizeY); + cost = costFunc + costIdx; + cost[0] = ( ly*(1-nthread) + lx*nthread == 0 ) ? + calcCostBorder(leftptr, rightptr, x, y, nthread, wsz2, costbuf, &head, cols, d, + costFunc[calcLocalIdx(lx-1*(1-nthread), ly-1*nthread, d, sizeY)]) : + calcCostInside(leftptr, rightptr, x, y, wsz2, cols, d, + costFunc[calcLocalIdx(lx-1, ly-1, d, sizeY)], + costFunc[calcLocalIdx(lx, ly-1, d, sizeY)], + costFunc[calcLocalIdx(lx-1, ly, d, sizeY)]); + barrier(CLK_LOCAL_MEM_FENCE); + + if(x < cols-mindisp-wsz2 && y < rows-wsz2) + { + int dispIdx = mad24(gy+ly, disp_step, disp_offset + (gx+lx)*(int)sizeof(short)); + disp = (__global short *)(dispptr + dispIdx); + calcDisp(&costFunc[sizeY - 1 - ly + lx], disp, uniquenessRatio, //textureTreshold, textsum, + mindisp, ndisp, 2*sizeY, &dispbuf[nthread*tsize/2], d); + } + barrier(CLK_LOCAL_MEM_FENCE); + + calcNewCoordinates(&lx, &ly, nthread); } } @@ -175,8 +255,9 @@ __kernel void stereoBM_BF(__global const uchar * left, __global const uchar * ri int y = get_global_id(1); int wsz2 = winsize/2; short FILTERED = (mindisp - 1)<<4; - + if(x < cols && y < rows ) + { int dispIdx = mad24(y, disp_step, disp_offset + x*(int)sizeof(short) ); __global short * disp = (__global short*)(dispptr + dispIdx); @@ -263,7 +344,11 @@ __kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned { int x = get_global_id(0); int y = get_global_id(1); - output[y * cols + x] = min(prefilterCap, 255) & 0xFF; + if(x < cols && y < rows) + { + output[y * cols + x] = min(prefilterCap, 255) & 0xFF; + } + if(x < cols && y < rows && x > 0 && !((y == rows-1)&(rows%2==1) ) ) { int cov = input[ ((y > 0) ? y-1 : y+1) * cols + (x-1)] * (-1) + input[ ((y > 0) ? y-1 : y+1) * cols + ((x> 4); + dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4); costptr[y*coststep] = sad[mind]; } } @@ -739,27 +739,43 @@ static bool ocl_stereobm_opt( InputArray _left, InputArray _right, OutputArray _disp, StereoBMParams* state) {//printf("opt\n"); int ndisp = state->numDisparities; + int mindisp = state->minDisparity; int wsz = state->SADWindowSize; - ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D tsize=%d -D wsz=%d", wsz*ndisp, ndisp, wsz) ); + int wsz2 = wsz/2; + + int sizeX = 9, sizeY = sizeX-1, N = ndisp*2; + + ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D tsize=%d -D wsz=%d", (2*sizeY)*ndisp, 2*ndisp, wsz) ); if(k.empty()) return false; UMat left = _left.getUMat(), right = _right.getUMat(); - _disp.create(_left.size(), CV_16S); - UMat disp = _disp.getUMat(); + int cols = left.cols, rows = left.rows; - size_t globalThreads[3] = { left.cols, (left.rows-left.rows%wsz + wsz), ndisp}; - size_t localThreads[3] = {1, wsz, ndisp}; + _disp.create(_left.size(), CV_16S); + _disp.setTo((mindisp - 1)<<4); + Rect roi = Rect(Point(wsz2 + mindisp + ndisp - 1, wsz2), Point(cols-1-wsz2-mindisp, rows-1-wsz2) ); + UMat disp = (_disp.getUMat())(roi); + + int globalX = disp.cols/sizeX, globalY = disp.rows/sizeY; + globalX += (disp.cols%sizeX) > 0 ? 1 : 0; + globalY += (disp.rows%sizeY) > 0 ? 1 : 0; + size_t globalThreads[3] = { globalX, globalY, N}; + size_t localThreads[3] = {1, 1, N}; int idx = 0; idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left)); idx = k.set(idx, ocl::KernelArg::PtrReadOnly(right)); - idx = k.set(idx, ocl::KernelArg::WriteOnly(disp)); - idx = k.set(idx, state->minDisparity); + idx = k.set(idx, ocl::KernelArg::WriteOnlyNoSize(disp)); + idx = k.set(idx, rows); + idx = k.set(idx, cols); + idx = k.set(idx, mindisp); idx = k.set(idx, ndisp); idx = k.set(idx, state->preFilterCap); idx = k.set(idx, state->textureThreshold); idx = k.set(idx, state->uniquenessRatio); + idx = k.set(idx, sizeX); + idx = k.set(idx, sizeY); return k.run(3, globalThreads, localThreads, false); } @@ -789,16 +805,15 @@ static bool ocl_stereobm_bf(InputArray _left, InputArray _right, idx = k.set(idx, state->uniquenessRatio); return k.run(2, globalThreads, NULL, false); - return false; } static bool ocl_stereo(InputArray _left, InputArray _right, OutputArray _disp, StereoBMParams* state) { - if(ocl::Device::getDefault().localMemSize() > state->numDisparities * state->numDisparities * sizeof(short) ) + //if(ocl::Device::getDefault().localMemSize() > state->numDisparities * state->numDisparities * sizeof(short) ) return ocl_stereobm_opt(_left, _right, _disp, state); - else - return ocl_stereobm_bf(_left, _right, _disp, state); + //else + // return ocl_stereobm_bf(_left, _right, _disp, state); } struct FindStereoCorrespInvoker : public ParallelLoopBody @@ -992,7 +1007,7 @@ public: bufSize2 = width*height*(sizeof(Point_) + sizeof(int) + sizeof(uchar)); #if CV_SSE2 - bool useShorts = false;//params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2); + bool useShorts = params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2); #else const bool useShorts = false; #endif diff --git a/modules/calib3d/test/opencl/test_stereobm.cpp b/modules/calib3d/test/opencl/test_stereobm.cpp index b6f777653..7b28483d3 100644 --- a/modules/calib3d/test/opencl/test_stereobm.cpp +++ b/modules/calib3d/test/opencl/test_stereobm.cpp @@ -90,18 +90,18 @@ OCL_TEST_P(StereoBMFixture, StereoBM) cv::ocl::finish(); long t3 = clock(); std::cout << (double)(t2-t1)/CLOCKS_PER_SEC << " " << (double)(t3-t2)/CLOCKS_PER_SEC << std::endl; - +/* Mat t; absdiff(disp, udisp, t); for(int i = 0; i(i,j) > 0) - if(i == 5 && j == 38) + if(t.at(i,j) > 0) + // if(i== 255 && j == 375) printf("%d %d cv: %d ocl: %d\n", i, j, disp.at(i,j), udisp.getMat(ACCESS_READ).at(i,j) ); /* imshow("diff.png", t*100); imshow("cv.png", disp*100); imshow("ocl.png", udisp.getMat(ACCESS_READ)*100); waitKey(0);*/ - Near(1e-3); +// Near(1e-3); } OCL_INSTANTIATE_TEST_CASE_P(StereoMatcher, StereoBMFixture, testing::Combine(testing::Values(32, 64, 128), From 18a59b48bad90dbbff9211de238279b60bbcbc45 Mon Sep 17 00:00:00 2001 From: Konstantin Matskevich Date: Wed, 12 Mar 2014 11:20:23 +0400 Subject: [PATCH 07/10] fixes --- modules/calib3d/src/opencl/stereobm.cl | 88 ++++++++++--------- modules/calib3d/test/opencl/test_stereobm.cpp | 6 +- 2 files changed, 49 insertions(+), 45 deletions(-) diff --git a/modules/calib3d/src/opencl/stereobm.cl b/modules/calib3d/src/opencl/stereobm.cl index 7036cdcf3..307b668b3 100644 --- a/modules/calib3d/src/opencl/stereobm.cl +++ b/modules/calib3d/src/opencl/stereobm.cl @@ -51,7 +51,7 @@ #define MAX_VAL 32767 void calcDisp(__local short * costFunc, __global short * disp, int uniquenessRatio/*, int textureTreshold, short textsum*/, - int mindisp, int ndisp, int w, __local short * dispbuf, int d) + int mindisp, int ndisp, int w, __local short * dispbuf, int d, int x, int y, int cols, int rows, int wsz2) { short FILTERED = (mindisp - 1)<<4; short best_disp = FILTERED, best_cost = MAX_VAL-1; @@ -73,6 +73,7 @@ void calcDisp(__local short * costFunc, __global short * disp, int uniquenessRat } best_disp = ndisp - dispbuf[0] - 1; best_cost = costFunc[(ndisp-best_disp-1)*w]; + barrier(CLK_LOCAL_MEM_FENCE); int thresh = best_cost + (best_cost * uniquenessRatio/100); dispbuf[d] = ( (cost[d*w] <= thresh) && (d < (ndisp - best_disp - 2) || d > (ndisp - best_disp) ) ) ? FILTERED : best_disp; @@ -90,7 +91,7 @@ void calcDisp(__local short * costFunc, __global short * disp, int uniquenessRat // best_disp = (textsum < textureTreshold) ? FILTERED : best_disp; - if( dispbuf[0] != FILTERED ) + if( dispbuf[0] != FILTERED && x < cols-wsz2-mindisp && y < rows-wsz2) { cost = &costFunc[0] + (ndisp - best_disp - 1)*w; int y3 = ((ndisp - best_disp - 1) > 0) ? cost[-w] : cost[w], @@ -179,68 +180,71 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar cost = costFunc + costIdx; short tempcost = 0; - for(int i = 0; i < wsz; i++) + if(x < cols-wsz2-mindisp && y < rows-wsz2) { - int idx = mad24(y-wsz2+i*nthread, cols, x-wsz2+i*(1-nthread)); - left = leftptr + idx; - right = rightptr + (idx - d); - short costdiff = 0; + for(int i = 0; i < wsz; i++) + { + int idx = mad24(y-wsz2+i*nthread, cols, x-wsz2+i*(1-nthread)); + left = leftptr + idx; + right = rightptr + (idx - d); + short costdiff = 0; - for(int j = 0; j < wsz; j++) - { - costdiff += abs( left[0] - right[0] ); - left += 1*nthread + cols*(1-nthread); - right += 1*nthread + cols*(1-nthread);// maybe use ? operator + for(int j = 0; j < wsz; j++) + { + costdiff += abs( left[0] - right[0] ); + left += 1*nthread + cols*(1-nthread); + right += 1*nthread + cols*(1-nthread);// maybe use ? operator + } + if(nthread==1) + { + tempcost += costdiff; + } + costbuf[head] = costdiff; + head++; } - if(nthread==1) - { - tempcost += costdiff; - } - costbuf[head] = costdiff; - head++; } barrier(CLK_LOCAL_MEM_FENCE); cost[0] = tempcost; - if(x < cols-wsz2-mindisp && y < rows-wsz2 && nthread == 1) - { - int dispIdx = mad24(gy, disp_step, disp_offset + gx*(int)sizeof(short)); - disp = (__global short *)(dispptr + dispIdx); - calcDisp(&costFunc[sizeY - 1 + lx - ly], disp, uniquenessRatio, /*textureTreshold, textsum,*/ - mindisp, ndisp, 2*sizeY, &dispbuf[nthread*tsize/2], d); - } + int dispIdx = mad24(gy, disp_step, disp_offset + gx*(int)sizeof(short)); + disp = (__global short *)(dispptr + dispIdx); + calcDisp(&costFunc[sizeY - 1 + lx - ly], disp, uniquenessRatio, /*textureTreshold, textsum,*/ + mindisp, ndisp, 2*sizeY, &dispbuf[nthread*tsize/2], d, x, y, cols, rows, wsz2); barrier(CLK_LOCAL_MEM_FENCE); lx = 1 - nthread; ly = nthread; - while(lx < sizeX && ly < sizeY ) + while(lx < sizeX || ly < sizeY ) { - x = gx + shiftX + lx; - y = gy + shiftY + ly; + x = (lx < sizeX) ? gx + shiftX + lx : cols; + y = (ly < sizeY) ? gy + shiftY + ly : rows; costIdx = calcLocalIdx(lx, ly, d, sizeY); cost = costFunc + costIdx; - cost[0] = ( ly*(1-nthread) + lx*nthread == 0 ) ? - calcCostBorder(leftptr, rightptr, x, y, nthread, wsz2, costbuf, &head, cols, d, - costFunc[calcLocalIdx(lx-1*(1-nthread), ly-1*nthread, d, sizeY)]) : - calcCostInside(leftptr, rightptr, x, y, wsz2, cols, d, - costFunc[calcLocalIdx(lx-1, ly-1, d, sizeY)], - costFunc[calcLocalIdx(lx, ly-1, d, sizeY)], - costFunc[calcLocalIdx(lx-1, ly, d, sizeY)]); + if(x < cols-wsz2-mindisp && y < rows-wsz2 ) + { + cost[0] = ( ly*(1-nthread) + lx*nthread == 0 ) ? + calcCostBorder(leftptr, rightptr, x, y, nthread, wsz2, costbuf, &head, cols, d, + costFunc[calcLocalIdx(lx-1*(1-nthread), ly-1*nthread, d, sizeY)]) : + calcCostInside(leftptr, rightptr, x, y, wsz2, cols, d, + costFunc[calcLocalIdx(lx-1, ly-1, d, sizeY)], + costFunc[calcLocalIdx(lx, ly-1, d, sizeY)], + costFunc[calcLocalIdx(lx-1, ly, d, sizeY)]); + } barrier(CLK_LOCAL_MEM_FENCE); - if(x < cols-mindisp-wsz2 && y < rows-wsz2) - { - int dispIdx = mad24(gy+ly, disp_step, disp_offset + (gx+lx)*(int)sizeof(short)); - disp = (__global short *)(dispptr + dispIdx); - calcDisp(&costFunc[sizeY - 1 - ly + lx], disp, uniquenessRatio, //textureTreshold, textsum, - mindisp, ndisp, 2*sizeY, &dispbuf[nthread*tsize/2], d); - } + int dispIdx = mad24(gy+ly, disp_step, disp_offset + (gx+lx)*(int)sizeof(short)); + disp = (__global short *)(dispptr + dispIdx); + calcDisp(&costFunc[sizeY - 1 - ly + lx], disp, uniquenessRatio, //textureTreshold, textsum, + mindisp, ndisp, 2*sizeY, &dispbuf[nthread*tsize/2], d, x, y, cols, rows, wsz2); barrier(CLK_LOCAL_MEM_FENCE); calcNewCoordinates(&lx, &ly, nthread); } + + + } #endif diff --git a/modules/calib3d/test/opencl/test_stereobm.cpp b/modules/calib3d/test/opencl/test_stereobm.cpp index 7b28483d3..3b034c620 100644 --- a/modules/calib3d/test/opencl/test_stereobm.cpp +++ b/modules/calib3d/test/opencl/test_stereobm.cpp @@ -90,18 +90,18 @@ OCL_TEST_P(StereoBMFixture, StereoBM) cv::ocl::finish(); long t3 = clock(); std::cout << (double)(t2-t1)/CLOCKS_PER_SEC << " " << (double)(t3-t2)/CLOCKS_PER_SEC << std::endl; -/* + Mat t; absdiff(disp, udisp, t); for(int i = 0; i(i,j) > 0) - // if(i== 255 && j == 375) + // if(i== 12 && j == 44) printf("%d %d cv: %d ocl: %d\n", i, j, disp.at(i,j), udisp.getMat(ACCESS_READ).at(i,j) ); /* imshow("diff.png", t*100); imshow("cv.png", disp*100); imshow("ocl.png", udisp.getMat(ACCESS_READ)*100); waitKey(0);*/ -// Near(1e-3); + Near(1e-3); } OCL_INSTANTIATE_TEST_CASE_P(StereoMatcher, StereoBMFixture, testing::Combine(testing::Values(32, 64, 128), From 91e43342273e1904457df08a62b36c01ed4ca8d7 Mon Sep 17 00:00:00 2001 From: Konstantin Matskevich Date: Wed, 12 Mar 2014 13:09:57 +0400 Subject: [PATCH 08/10] temp --- modules/calib3d/perf/opencl/perf_stereobm.cpp | 2 +- modules/calib3d/src/opencl/stereobm.cl | 66 +++++++++---------- modules/calib3d/src/stereobm.cpp | 7 +- modules/calib3d/test/opencl/test_stereobm.cpp | 4 +- 4 files changed, 39 insertions(+), 40 deletions(-) diff --git a/modules/calib3d/perf/opencl/perf_stereobm.cpp b/modules/calib3d/perf/opencl/perf_stereobm.cpp index 936845a4b..1b3cf3745 100644 --- a/modules/calib3d/perf/opencl/perf_stereobm.cpp +++ b/modules/calib3d/perf/opencl/perf_stereobm.cpp @@ -51,7 +51,7 @@ namespace ocl { typedef std::tr1::tuple StereoBMFixture_t; typedef TestBaseWithParam StereoBMFixture; -OCL_PERF_TEST_P(StereoBMFixture, StereoBM, ::testing::Combine(OCL_PERF_ENUM(32, 64), OCL_PERF_ENUM(11,21) ) ) +OCL_PERF_TEST_P(StereoBMFixture, StereoBM, ::testing::Combine(OCL_PERF_ENUM(32, 64, 128), OCL_PERF_ENUM(11,21) ) ) { const int n_disp = get<0>(GetParam()), winSize = get<1>(GetParam()); UMat left, right, disp; diff --git a/modules/calib3d/src/opencl/stereobm.cl b/modules/calib3d/src/opencl/stereobm.cl index 307b668b3..e5f553b68 100644 --- a/modules/calib3d/src/opencl/stereobm.cl +++ b/modules/calib3d/src/opencl/stereobm.cl @@ -50,14 +50,12 @@ #define MAX_VAL 32767 -void calcDisp(__local short * costFunc, __global short * disp, int uniquenessRatio/*, int textureTreshold, short textsum*/, +void calcDisp(__local short * cost, __global short * disp, int uniquenessRatio/*, int textureTreshold, short textsum*/, int mindisp, int ndisp, int w, __local short * dispbuf, int d, int x, int y, int cols, int rows, int wsz2) { short FILTERED = (mindisp - 1)<<4; - short best_disp = FILTERED, best_cost = MAX_VAL-1; - __local short * cost; + short best_disp, best_cost; - cost = &costFunc[0]; dispbuf[d] = d; barrier(CLK_LOCAL_MEM_FENCE); @@ -72,28 +70,23 @@ void calcDisp(__local short * costFunc, __global short * disp, int uniquenessRat barrier(CLK_LOCAL_MEM_FENCE); } best_disp = ndisp - dispbuf[0] - 1; - best_cost = costFunc[(ndisp-best_disp-1)*w]; + best_cost = cost[(ndisp-best_disp-1)*w]; barrier(CLK_LOCAL_MEM_FENCE); int thresh = best_cost + (best_cost * uniquenessRatio/100); - dispbuf[d] = ( (cost[d*w] <= thresh) && (d < (ndisp - best_disp - 2) || d > (ndisp - best_disp) ) ) ? FILTERED : best_disp; + + bool notUniq = ( (cost[d*w] <= thresh) && (d < (ndisp - best_disp - 2) || d > (ndisp - best_disp) ) ); + + if(notUniq) + dispbuf[0] = FILTERED; barrier(CLK_LOCAL_MEM_FENCE); - for(int lsize = tsize/2 >> 1; lsize > 0; lsize >>= 1) - { - short val1 = dispbuf[d], val2 = dispbuf[d+lsize]; - if (d < lsize) - { - dispbuf[d] = min(val1, val2); - } - barrier(CLK_LOCAL_MEM_FENCE); - } // best_disp = (textsum < textureTreshold) ? FILTERED : best_disp; if( dispbuf[0] != FILTERED && x < cols-wsz2-mindisp && y < rows-wsz2) { - cost = &costFunc[0] + (ndisp - best_disp - 1)*w; + cost = &cost[0] + (ndisp - best_disp - 1)*w; int y3 = ((ndisp - best_disp - 1) > 0) ? cost[-w] : cost[w], y2 = cost[0], y1 = ((ndisp - best_disp - 1) < ndisp-1) ? cost[w] : cost[-w]; @@ -115,7 +108,7 @@ void calcNewCoordinates(int * x, int * y, int nthread) } short calcCostBorder(__global const uchar * leftptr, __global const uchar * rightptr, int x, int y, int nthread, - int wsz2, short * costbuf, int * h, int cols, int d, short cost) + int wsz2, short * costbuf, int * h, int cols, int d, short cost, int winsize) { int head = (*h)%wsz; __global const uchar * left, * right; @@ -124,11 +117,12 @@ short calcCostBorder(__global const uchar * leftptr, __global const uchar * righ right = rightptr + (idx - d); short costdiff = 0; - for(int i = 0; i < wsz; i++) + for(int i = 0; i < winsize; i++) { - costdiff += abs( left[0] - right[0] ); - left += 1*nthread + cols*(1-nthread); - right += 1*nthread + cols*(1-nthread);// maybe use ? operator + int shift = 1*nthread + cols*(1-nthread); + costdiff += abs( left[0] - right[0] ); + left += shift; + right += shift; } cost += costdiff - costbuf[head]; costbuf[head] = costdiff; @@ -144,21 +138,25 @@ short calcCostInside(__global const uchar * leftptr, __global const uchar * righ left = leftptr + idx; right = rightptr + (idx - d); - return cost_up + cost_left - cost_up_left + abs(left[0] - right[0]) - - abs(left[wsz] - right[wsz]) - abs(left[(wsz)*cols] - right[(wsz)*cols]) + - abs(left[(wsz)*cols + wsz] - right[(wsz)*cols + wsz]); + uchar corrner1 = abs(left[0] - right[0]), + corrner2 = abs(left[wsz] - right[wsz]), + corrner3 = abs(left[(wsz)*cols] - right[(wsz)*cols]), + corrner4 = abs(left[(wsz)*cols + wsz] - right[(wsz)*cols + wsz]); + + return cost_up + cost_left - cost_up_left + corrner1 - + corrner2 - corrner3 + corrner4; } __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar * rightptr, __global uchar * dispptr, int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp, - int preFilterCap, int textureTreshold, int uniquenessRatio, int sizeX, int sizeY) + int preFilterCap, int textureTreshold, int uniquenessRatio, int sizeX, int sizeY, int winsize) { int gx = get_global_id(0)*sizeX; int gy = get_global_id(1)*sizeY; int lz = get_local_id(2); - int nthread = lz/32;// only 0 or 1 - int d = lz%32;// 1 .. 32 + int nthread = lz/ndisp;// only 0 or 1 + int d = lz%ndisp;// 1 .. ndisp int wsz2 = wsz/2; __global short * disp; @@ -203,19 +201,22 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar head++; } } - barrier(CLK_LOCAL_MEM_FENCE); + if(nthread==1) + { cost[0] = tempcost; + } + barrier(CLK_LOCAL_MEM_FENCE); int dispIdx = mad24(gy, disp_step, disp_offset + gx*(int)sizeof(short)); disp = (__global short *)(dispptr + dispIdx); - calcDisp(&costFunc[sizeY - 1 + lx - ly], disp, uniquenessRatio, /*textureTreshold, textsum,*/ + calcDisp(&costFunc[sizeY - 1 + lx - ly], disp, uniquenessRatio, //textureTreshold, textsum, mindisp, ndisp, 2*sizeY, &dispbuf[nthread*tsize/2], d, x, y, cols, rows, wsz2); barrier(CLK_LOCAL_MEM_FENCE); lx = 1 - nthread; ly = nthread; - while(lx < sizeX || ly < sizeY ) + for(int i = 0; i < iters; i++) { x = (lx < sizeX) ? gx + shiftX + lx : cols; y = (ly < sizeY) ? gy + shiftY + ly : rows; @@ -226,7 +227,7 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar { cost[0] = ( ly*(1-nthread) + lx*nthread == 0 ) ? calcCostBorder(leftptr, rightptr, x, y, nthread, wsz2, costbuf, &head, cols, d, - costFunc[calcLocalIdx(lx-1*(1-nthread), ly-1*nthread, d, sizeY)]) : + costFunc[calcLocalIdx(lx-1*(1-nthread), ly-1*nthread, d, sizeY)], winsize) : calcCostInside(leftptr, rightptr, x, y, wsz2, cols, d, costFunc[calcLocalIdx(lx-1, ly-1, d, sizeY)], costFunc[calcLocalIdx(lx, ly-1, d, sizeY)], @@ -242,9 +243,6 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar calcNewCoordinates(&lx, &ly, nthread); } - - - } #endif diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp index 93a9643ef..4d28b57d4 100644 --- a/modules/calib3d/src/stereobm.cpp +++ b/modules/calib3d/src/stereobm.cpp @@ -743,9 +743,9 @@ static bool ocl_stereobm_opt( InputArray _left, InputArray _right, int wsz = state->SADWindowSize; int wsz2 = wsz/2; - int sizeX = 9, sizeY = sizeX-1, N = ndisp*2; + int sizeX = 13, sizeY = sizeX-1, N = ndisp*2; - ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D tsize=%d -D wsz=%d", (2*sizeY)*ndisp, 2*ndisp, wsz) ); + ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D tsize=%d -D wsz=%d -D iters=%d", (2*sizeY)*ndisp, 2*ndisp, wsz, sizeX*sizeY/2) ); if(k.empty()) return false; @@ -754,7 +754,7 @@ static bool ocl_stereobm_opt( InputArray _left, InputArray _right, _disp.create(_left.size(), CV_16S); _disp.setTo((mindisp - 1)<<4); - Rect roi = Rect(Point(wsz2 + mindisp + ndisp - 1, wsz2), Point(cols-1-wsz2-mindisp, rows-1-wsz2) ); + Rect roi = Rect(Point(wsz2 + mindisp + ndisp - 1, wsz2), Point(cols-wsz2-mindisp, rows-wsz2) ); UMat disp = (_disp.getUMat())(roi); int globalX = disp.cols/sizeX, globalY = disp.rows/sizeY; @@ -776,6 +776,7 @@ static bool ocl_stereobm_opt( InputArray _left, InputArray _right, idx = k.set(idx, state->uniquenessRatio); idx = k.set(idx, sizeX); idx = k.set(idx, sizeY); + idx = k.set(idx, wsz); return k.run(3, globalThreads, localThreads, false); } diff --git a/modules/calib3d/test/opencl/test_stereobm.cpp b/modules/calib3d/test/opencl/test_stereobm.cpp index 3b034c620..3acb57e37 100644 --- a/modules/calib3d/test/opencl/test_stereobm.cpp +++ b/modules/calib3d/test/opencl/test_stereobm.cpp @@ -90,12 +90,12 @@ OCL_TEST_P(StereoBMFixture, StereoBM) cv::ocl::finish(); long t3 = clock(); std::cout << (double)(t2-t1)/CLOCKS_PER_SEC << " " << (double)(t3-t2)/CLOCKS_PER_SEC << std::endl; - + /* Mat t; absdiff(disp, udisp, t); for(int i = 0; i(i,j) > 0) - // if(i== 12 && j == 44) + // if(i == 5 && j == 68) printf("%d %d cv: %d ocl: %d\n", i, j, disp.at(i,j), udisp.getMat(ACCESS_READ).at(i,j) ); /* imshow("diff.png", t*100); imshow("cv.png", disp*100); From af1084eb582669c3f0068966a58cad205a4d6553 Mon Sep 17 00:00:00 2001 From: Konstantin Matskevich Date: Thu, 13 Mar 2014 09:33:15 +0400 Subject: [PATCH 09/10] speedUp --- modules/calib3d/src/opencl/stereobm.cl | 94 +++++++++---------- modules/calib3d/src/stereobm.cpp | 4 +- modules/calib3d/test/opencl/test_stereobm.cpp | 4 +- 3 files changed, 49 insertions(+), 53 deletions(-) diff --git a/modules/calib3d/src/opencl/stereobm.cl b/modules/calib3d/src/opencl/stereobm.cl index e5f553b68..caca016ee 100644 --- a/modules/calib3d/src/opencl/stereobm.cl +++ b/modules/calib3d/src/opencl/stereobm.cl @@ -51,47 +51,27 @@ #define MAX_VAL 32767 void calcDisp(__local short * cost, __global short * disp, int uniquenessRatio/*, int textureTreshold, short textsum*/, - int mindisp, int ndisp, int w, __local short * dispbuf, int d, int x, int y, int cols, int rows, int wsz2) + int mindisp, int ndisp, int w, __local int * bestDisp, __local int * bestCost, int d, int x, int y, int cols, int rows, int wsz2) { short FILTERED = (mindisp - 1)<<4; - short best_disp, best_cost; - - dispbuf[d] = d; - barrier(CLK_LOCAL_MEM_FENCE); - - for(int lsize = tsize/2 >> 1; lsize > 0; lsize >>= 1) - { - short lid1 = dispbuf[d], lid2 = dispbuf[d+lsize], - cost1 = cost[lid1*w], cost2 = cost[lid2*w]; - if (d < lsize) - { - dispbuf[d] = (cost1 < cost2) ? lid1 : (cost1==cost2) ? max(lid1, lid2) : lid2; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - best_disp = ndisp - dispbuf[0] - 1; - best_cost = cost[(ndisp-best_disp-1)*w]; - barrier(CLK_LOCAL_MEM_FENCE); + int best_disp = *bestDisp, best_cost = *bestCost, best_disp_back = ndisp - best_disp - 1; int thresh = best_cost + (best_cost * uniquenessRatio/100); - - bool notUniq = ( (cost[d*w] <= thresh) && (d < (ndisp - best_disp - 2) || d > (ndisp - best_disp) ) ); + bool notUniq = ( (cost[0] <= thresh) && (d < (best_disp_back - 1) || d > (best_disp_back + 1) ) ); if(notUniq) - dispbuf[0] = FILTERED; + *bestCost = FILTERED; barrier(CLK_LOCAL_MEM_FENCE); - // best_disp = (textsum < textureTreshold) ? FILTERED : best_disp; - if( dispbuf[0] != FILTERED && x < cols-wsz2-mindisp && y < rows-wsz2) + if( *bestCost != FILTERED && x < cols-wsz2-mindisp && y < rows-wsz2 && d == best_disp_back) { - cost = &cost[0] + (ndisp - best_disp - 1)*w; - int y3 = ((ndisp - best_disp - 1) > 0) ? cost[-w] : cost[w], + int y3 = (best_disp_back > 0) ? cost[-w] : cost[w], y2 = cost[0], - y1 = ((ndisp - best_disp - 1) < ndisp-1) ? cost[w] : cost[-w]; + y1 = (best_disp_back < ndisp-1) ? cost[w] : cost[-w]; int d = y3+y1-2*y2 + abs(y3-y1); - disp[0] = (short)(((ndisp - best_disp - 1 + mindisp)*256 + (d != 0 ? (y3-y1)*256/d : 0) + 15) >> 4); + disp[0] = (short)(((best_disp_back + mindisp)*256 + (d != 0 ? (y3-y1)*256/d : 0) + 15) >> 4); } } @@ -115,11 +95,11 @@ short calcCostBorder(__global const uchar * leftptr, __global const uchar * righ int idx = mad24(y+wsz2*(2*nthread-1), cols, x+wsz2*(1-2*nthread)); left = leftptr + idx; right = rightptr + (idx - d); + int shift = 1*nthread + cols*(1-nthread); short costdiff = 0; for(int i = 0; i < winsize; i++) { - int shift = 1*nthread + cols*(1-nthread); costdiff += abs( left[0] - right[0] ); left += shift; right += shift; @@ -131,7 +111,7 @@ short calcCostBorder(__global const uchar * leftptr, __global const uchar * righ } short calcCostInside(__global const uchar * leftptr, __global const uchar * rightptr, int x, int y, - int wsz2, int cols, int d, short cost_up_left, short cost_up, short cost_left) + int wsz2, int cols, int d, short cost_up_left, short cost_up, short cost_left, int winsize) { __global const uchar * left, * right; int idx = mad24(y-wsz2-1, cols, x-wsz2-1); @@ -139,9 +119,9 @@ short calcCostInside(__global const uchar * leftptr, __global const uchar * righ right = rightptr + (idx - d); uchar corrner1 = abs(left[0] - right[0]), - corrner2 = abs(left[wsz] - right[wsz]), - corrner3 = abs(left[(wsz)*cols] - right[(wsz)*cols]), - corrner4 = abs(left[(wsz)*cols + wsz] - right[(wsz)*cols + wsz]); + corrner2 = abs(left[winsize] - right[winsize]), + corrner3 = abs(left[(winsize)*cols] - right[(winsize)*cols]), + corrner4 = abs(left[(winsize)*cols + winsize] - right[(winsize)*cols + winsize]); return cost_up + cost_left - cost_up_left + corrner1 - corrner2 - corrner3 + corrner4; @@ -162,9 +142,11 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar __global short * disp; __global const uchar * left, * right; - __local short dispbuf[tsize]; __local short costFunc[csize]; __local short * cost; + __local int best_disp[2]; + __local int best_cost[2]; + best_cost[nthread] = MAX_VAL; short costbuf[wsz]; int head = 0; @@ -180,18 +162,19 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar short tempcost = 0; if(x < cols-wsz2-mindisp && y < rows-wsz2) { - for(int i = 0; i < wsz; i++) + int shift = 1*nthread + cols*(1-nthread); + for(int i = 0; i < winsize; i++) { int idx = mad24(y-wsz2+i*nthread, cols, x-wsz2+i*(1-nthread)); left = leftptr + idx; right = rightptr + (idx - d); short costdiff = 0; - for(int j = 0; j < wsz; j++) + for(int j = 0; j < winsize; j++) { costdiff += abs( left[0] - right[0] ); - left += 1*nthread + cols*(1-nthread); - right += 1*nthread + cols*(1-nthread);// maybe use ? operator + left += shift; + right += shift; } if(nthread==1) { @@ -203,42 +186,55 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar } if(nthread==1) { - cost[0] = tempcost; + cost[0] = tempcost; + atomic_min(best_cost+nthread, tempcost); } barrier(CLK_LOCAL_MEM_FENCE); + if(best_cost[1] == tempcost) + best_disp[1] = ndisp - d - 1; + barrier(CLK_LOCAL_MEM_FENCE); + int dispIdx = mad24(gy, disp_step, disp_offset + gx*(int)sizeof(short)); disp = (__global short *)(dispptr + dispIdx); - calcDisp(&costFunc[sizeY - 1 + lx - ly], disp, uniquenessRatio, //textureTreshold, textsum, - mindisp, ndisp, 2*sizeY, &dispbuf[nthread*tsize/2], d, x, y, cols, rows, wsz2); + calcDisp(cost, disp, uniquenessRatio, //textureTreshold, textsum, + mindisp, ndisp, 2*sizeY, best_disp + 1, best_cost+1, d, x, y, cols, rows, wsz2); barrier(CLK_LOCAL_MEM_FENCE); lx = 1 - nthread; ly = nthread; - for(int i = 0; i < iters; i++) + for(int i = 0; i < sizeY*sizeX/2; i++) { x = (lx < sizeX) ? gx + shiftX + lx : cols; y = (ly < sizeY) ? gy + shiftY + ly : rows; + best_cost[nthread] = MAX_VAL; + barrier(CLK_LOCAL_MEM_FENCE); + costIdx = calcLocalIdx(lx, ly, d, sizeY); cost = costFunc + costIdx; + if(x < cols-wsz2-mindisp && y < rows-wsz2 ) { - cost[0] = ( ly*(1-nthread) + lx*nthread == 0 ) ? + tempcost = ( ly*(1-nthread) + lx*nthread == 0 ) ? calcCostBorder(leftptr, rightptr, x, y, nthread, wsz2, costbuf, &head, cols, d, - costFunc[calcLocalIdx(lx-1*(1-nthread), ly-1*nthread, d, sizeY)], winsize) : + cost[2*nthread-1], winsize) : calcCostInside(leftptr, rightptr, x, y, wsz2, cols, d, - costFunc[calcLocalIdx(lx-1, ly-1, d, sizeY)], - costFunc[calcLocalIdx(lx, ly-1, d, sizeY)], - costFunc[calcLocalIdx(lx-1, ly, d, sizeY)]); + cost[0], cost[1], cost[-1], winsize); + cost[0] = tempcost; + atomic_min(best_cost + nthread, tempcost); } barrier(CLK_LOCAL_MEM_FENCE); + if(best_cost[nthread] == tempcost) + best_disp[nthread] = ndisp - d - 1; + barrier(CLK_LOCAL_MEM_FENCE); + int dispIdx = mad24(gy+ly, disp_step, disp_offset + (gx+lx)*(int)sizeof(short)); disp = (__global short *)(dispptr + dispIdx); - calcDisp(&costFunc[sizeY - 1 - ly + lx], disp, uniquenessRatio, //textureTreshold, textsum, - mindisp, ndisp, 2*sizeY, &dispbuf[nthread*tsize/2], d, x, y, cols, rows, wsz2); + calcDisp(cost, disp, uniquenessRatio, //textureTreshold, textsum, + mindisp, ndisp, 2*sizeY, best_disp + nthread, best_cost + nthread, d, x, y, cols, rows, wsz2); barrier(CLK_LOCAL_MEM_FENCE); calcNewCoordinates(&lx, &ly, nthread); diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp index 4d28b57d4..4d17b473d 100644 --- a/modules/calib3d/src/stereobm.cpp +++ b/modules/calib3d/src/stereobm.cpp @@ -743,9 +743,9 @@ static bool ocl_stereobm_opt( InputArray _left, InputArray _right, int wsz = state->SADWindowSize; int wsz2 = wsz/2; - int sizeX = 13, sizeY = sizeX-1, N = ndisp*2; + int sizeX = std::max(11, 27 - ocl::Device::getDefault().maxComputeUnits() ), sizeY = sizeX-1, N = ndisp*2; - ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D tsize=%d -D wsz=%d -D iters=%d", (2*sizeY)*ndisp, 2*ndisp, wsz, sizeX*sizeY/2) ); + ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D wsz=%d", (2*sizeY)*ndisp, wsz) ); if(k.empty()) return false; diff --git a/modules/calib3d/test/opencl/test_stereobm.cpp b/modules/calib3d/test/opencl/test_stereobm.cpp index 3acb57e37..16b3a2a8d 100644 --- a/modules/calib3d/test/opencl/test_stereobm.cpp +++ b/modules/calib3d/test/opencl/test_stereobm.cpp @@ -94,8 +94,8 @@ OCL_TEST_P(StereoBMFixture, StereoBM) Mat t; absdiff(disp, udisp, t); for(int i = 0; i(i,j) > 0) - // if(i == 5 && j == 68) + // if(t.at(i,j) > 0) + if(i == 5 && j == 36) printf("%d %d cv: %d ocl: %d\n", i, j, disp.at(i,j), udisp.getMat(ACCESS_READ).at(i,j) ); /* imshow("diff.png", t*100); imshow("cv.png", disp*100); From 1a43ed989321df4e6e1867edbeaafc0bc0b010b7 Mon Sep 17 00:00:00 2001 From: Konstantin Matskevich Date: Fri, 14 Mar 2014 13:28:35 +0400 Subject: [PATCH 10/10] finalizing --- modules/calib3d/perf/opencl/perf_stereobm.cpp | 3 +- modules/calib3d/src/opencl/stereobm.cl | 108 ++++-------------- modules/calib3d/src/stereobm.cpp | 64 +++-------- modules/calib3d/test/opencl/test_stereobm.cpp | 19 +-- 4 files changed, 42 insertions(+), 152 deletions(-) diff --git a/modules/calib3d/perf/opencl/perf_stereobm.cpp b/modules/calib3d/perf/opencl/perf_stereobm.cpp index 1b3cf3745..8fca1b894 100644 --- a/modules/calib3d/perf/opencl/perf_stereobm.cpp +++ b/modules/calib3d/perf/opencl/perf_stereobm.cpp @@ -65,10 +65,11 @@ OCL_PERF_TEST_P(StereoBMFixture, StereoBM, ::testing::Combine(OCL_PERF_ENUM(32, Ptr bm = createStereoBM( n_disp, winSize ); bm->setPreFilterType(bm->PREFILTER_XSOBEL); + bm->setTextureThreshold(0); OCL_TEST_CYCLE() bm->compute(left, right, disp); - SANITY_CHECK_NOTHING();//(disp, 1e-3, ERROR_RELATIVE); + SANITY_CHECK(disp, 1e-3, ERROR_RELATIVE); } }//ocl diff --git a/modules/calib3d/src/opencl/stereobm.cl b/modules/calib3d/src/opencl/stereobm.cl index caca016ee..a746c8950 100644 --- a/modules/calib3d/src/opencl/stereobm.cl +++ b/modules/calib3d/src/opencl/stereobm.cl @@ -40,8 +40,6 @@ // //M*/ -#pragma OPENCL EXTENSION cl_amd_printf : enable - ////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////// stereoBM ////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// @@ -50,28 +48,28 @@ #define MAX_VAL 32767 -void calcDisp(__local short * cost, __global short * disp, int uniquenessRatio/*, int textureTreshold, short textsum*/, - int mindisp, int ndisp, int w, __local int * bestDisp, __local int * bestCost, int d, int x, int y, int cols, int rows, int wsz2) +void calcDisp(__local short * cost, __global short * disp, int uniquenessRatio, int mindisp, int ndisp, int w, + __local int * bestDisp, __local int * bestCost, int d, int x, int y, int cols, int rows, int wsz2) { short FILTERED = (mindisp - 1)<<4; int best_disp = *bestDisp, best_cost = *bestCost, best_disp_back = ndisp - best_disp - 1; + short c = cost[0]; + int thresh = best_cost + (best_cost * uniquenessRatio/100); - bool notUniq = ( (cost[0] <= thresh) && (d < (best_disp_back - 1) || d > (best_disp_back + 1) ) ); + bool notUniq = ( (c <= thresh) && (d < (best_disp_back - 1) || d > (best_disp_back + 1) ) ); if(notUniq) *bestCost = FILTERED; barrier(CLK_LOCAL_MEM_FENCE); -// best_disp = (textsum < textureTreshold) ? FILTERED : best_disp; - if( *bestCost != FILTERED && x < cols-wsz2-mindisp && y < rows-wsz2 && d == best_disp_back) { int y3 = (best_disp_back > 0) ? cost[-w] : cost[w], - y2 = cost[0], + y2 = c, y1 = (best_disp_back < ndisp-1) ? cost[w] : cost[-w]; - int d = y3+y1-2*y2 + abs(y3-y1); - disp[0] = (short)(((best_disp_back + mindisp)*256 + (d != 0 ? (y3-y1)*256/d : 0) + 15) >> 4); + int d_aprox = y3+y1-2*y2 + abs(y3-y1); + disp[0] = (short)(((best_disp_back + mindisp)*256 + (d_aprox != 0 ? (y3-y1)*256/d_aprox : 0) + 15) >> 4); } } @@ -111,23 +109,25 @@ short calcCostBorder(__global const uchar * leftptr, __global const uchar * righ } short calcCostInside(__global const uchar * leftptr, __global const uchar * rightptr, int x, int y, - int wsz2, int cols, int d, short cost_up_left, short cost_up, short cost_left, int winsize) + int wsz2, int cols, int d, short cost_up_left, short cost_up, short cost_left, + int winsize) { __global const uchar * left, * right; int idx = mad24(y-wsz2-1, cols, x-wsz2-1); left = leftptr + idx; right = rightptr + (idx - d); + int idx2 = winsize*cols; uchar corrner1 = abs(left[0] - right[0]), corrner2 = abs(left[winsize] - right[winsize]), - corrner3 = abs(left[(winsize)*cols] - right[(winsize)*cols]), - corrner4 = abs(left[(winsize)*cols + winsize] - right[(winsize)*cols + winsize]); + corrner3 = abs(left[idx2] - right[idx2]), + corrner4 = abs(left[idx2 + winsize] - right[idx2 + winsize]); return cost_up + cost_left - cost_up_left + corrner1 - corrner2 - corrner3 + corrner4; } -__kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar * rightptr, __global uchar * dispptr, +__kernel void stereoBM(__global const uchar * leftptr, __global const uchar * rightptr, __global uchar * dispptr, int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp, int preFilterCap, int textureTreshold, int uniquenessRatio, int sizeX, int sizeY, int winsize) { @@ -135,8 +135,8 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar int gy = get_global_id(1)*sizeY; int lz = get_local_id(2); - int nthread = lz/ndisp;// only 0 or 1 - int d = lz%ndisp;// 1 .. ndisp + int nthread = lz/ndisp; + int d = lz%ndisp; int wsz2 = wsz/2; __global short * disp; @@ -169,7 +169,6 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar left = leftptr + idx; right = rightptr + (idx - d); short costdiff = 0; - for(int j = 0; j < winsize; j++) { costdiff += abs( left[0] - right[0] ); @@ -197,8 +196,8 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar int dispIdx = mad24(gy, disp_step, disp_offset + gx*(int)sizeof(short)); disp = (__global short *)(dispptr + dispIdx); - calcDisp(cost, disp, uniquenessRatio, //textureTreshold, textsum, - mindisp, ndisp, 2*sizeY, best_disp + 1, best_cost+1, d, x, y, cols, rows, wsz2); + calcDisp(cost, disp, uniquenessRatio, mindisp, ndisp, 2*sizeY, + best_disp + 1, best_cost+1, d, x, y, cols, rows, wsz2); barrier(CLK_LOCAL_MEM_FENCE); lx = 1 - nthread; @@ -222,9 +221,9 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar cost[2*nthread-1], winsize) : calcCostInside(leftptr, rightptr, x, y, wsz2, cols, d, cost[0], cost[1], cost[-1], winsize); - cost[0] = tempcost; - atomic_min(best_cost + nthread, tempcost); } + cost[0] = tempcost; + atomic_min(best_cost + nthread, tempcost); barrier(CLK_LOCAL_MEM_FENCE); if(best_cost[nthread] == tempcost) @@ -233,8 +232,9 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar int dispIdx = mad24(gy+ly, disp_step, disp_offset + (gx+lx)*(int)sizeof(short)); disp = (__global short *)(dispptr + dispIdx); - calcDisp(cost, disp, uniquenessRatio, //textureTreshold, textsum, - mindisp, ndisp, 2*sizeY, best_disp + nthread, best_cost + nthread, d, x, y, cols, rows, wsz2); + + calcDisp(cost, disp, uniquenessRatio, mindisp, ndisp, 2*sizeY, + best_disp + nthread, best_cost + nthread, d, x, y, cols, rows, wsz2); barrier(CLK_LOCAL_MEM_FENCE); calcNewCoordinates(&lx, &ly, nthread); @@ -243,68 +243,6 @@ __kernel void stereoBM_opt(__global const uchar * leftptr, __global const uchar #endif -#ifdef SIZE - -__kernel void stereoBM_BF(__global const uchar * left, __global const uchar * right, __global uchar * dispptr, - int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp, - int preFilterCap, int winsize, int textureTreshold, int uniquenessRatio) -{ - int x = get_global_id(0); - int y = get_global_id(1); - int wsz2 = winsize/2; - short FILTERED = (mindisp - 1)<<4; - - if(x < cols && y < rows ) - - { - int dispIdx = mad24(y, disp_step, disp_offset + x*(int)sizeof(short) ); - __global short * disp = (__global short*)(dispptr + dispIdx); - disp[0] = FILTERED; - if( (x > mindisp+ndisp+wsz2-2) && (y > wsz2-1) && (x < cols-wsz2-mindisp) && (y < rows - wsz2)) - { - int cost[SIZE]; - int textsum = 0; - - for(int d = mindisp; d < ndisp+mindisp; d++) - { - cost[(ndisp-1) - (d - mindisp)] = 0; - for(int i = -wsz2; i < wsz2+1; i++) - for(int j = -wsz2; j < wsz2+1; j++) - { - textsum += (d == mindisp) ? abs( left[ (y+i) * cols + x + j] - preFilterCap ) : 0; - cost[(ndisp-1) - (d - mindisp)] += abs(left[(y+i) * cols + x+j] - right[(y+i) * cols + x+j-d] ); - } - } - - int best_disp = -1, best_cost = INT_MAX; - for(int d = ndisp + mindisp - 1; d > mindisp-1; d--) - { - best_cost = (cost[d-mindisp] < best_cost) ? cost[d-mindisp] : best_cost; - best_disp = (best_cost == cost[d-mindisp]) ? (d) : best_disp; - } - - int thresh = best_cost + (best_cost * uniquenessRatio/100); - for(int d = mindisp; (d < ndisp + mindisp) && (uniquenessRatio > 0); d++) - { - best_disp = ( (cost[d-mindisp] <= thresh) && (d < best_disp-1 || d > best_disp + 1) ) ? FILTERED : best_disp; - } - - disp[0] = textsum < textureTreshold ? (FILTERED) : (best_disp == FILTERED) ? (short)(best_disp) : (short)(best_disp); - - if( best_disp != FILTERED ) - { - int y1 = (best_disp > mindisp) ? cost[best_disp-mindisp-1] : cost[best_disp-mindisp+1], - y2 = cost[best_disp-mindisp], - y3 = (best_disp < mindisp+ndisp-1) ? cost[best_disp-mindisp+1] : cost[best_disp-mindisp-1]; - int _d = y3+y1-2*y2 + abs(y3-y1); - disp[0] = (short)(((ndisp - (best_disp-mindisp) - 1 + mindisp)*256 + (_d != 0 ? (y3-y1)*256/_d : 0) + 15) >> 4); - } - } - } -} - -#endif - ////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////// Norm Prefiler //////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp index 4d17b473d..7c06debcb 100644 --- a/modules/calib3d/src/stereobm.cpp +++ b/modules/calib3d/src/stereobm.cpp @@ -735,9 +735,9 @@ struct PrefilterInvoker : public ParallelLoopBody StereoBMParams* state; }; -static bool ocl_stereobm_opt( InputArray _left, InputArray _right, +static bool ocl_stereobm( InputArray _left, InputArray _right, OutputArray _disp, StereoBMParams* state) -{//printf("opt\n"); +{ int ndisp = state->numDisparities; int mindisp = state->minDisparity; int wsz = state->SADWindowSize; @@ -745,7 +745,7 @@ static bool ocl_stereobm_opt( InputArray _left, InputArray _right, int sizeX = std::max(11, 27 - ocl::Device::getDefault().maxComputeUnits() ), sizeY = sizeX-1, N = ndisp*2; - ocl::Kernel k("stereoBM_opt", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D wsz=%d", (2*sizeY)*ndisp, wsz) ); + ocl::Kernel k("stereoBM", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D wsz=%d", (2*sizeY)*ndisp, wsz) ); if(k.empty()) return false; @@ -781,42 +781,6 @@ static bool ocl_stereobm_opt( InputArray _left, InputArray _right, return k.run(3, globalThreads, localThreads, false); } -static bool ocl_stereobm_bf(InputArray _left, InputArray _right, - OutputArray _disp, StereoBMParams* state) -{ - ocl::Kernel k("stereoBM_BF", ocl::calib3d::stereobm_oclsrc, cv::format("-D SIZE=%d", state->numDisparities ) ); - if(k.empty()) - return false; - - UMat left = _left.getUMat(), right = _right.getUMat(); - _disp.create(_left.size(), CV_16S); - UMat disp = _disp.getUMat(); - - size_t globalThreads[3] = { left.cols, left.rows, 1 }; - - int idx = 0; - idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left)); - idx = k.set(idx, ocl::KernelArg::PtrReadOnly(right)); - idx = k.set(idx, ocl::KernelArg::WriteOnly(disp)); - idx = k.set(idx, state->minDisparity); - idx = k.set(idx, state->numDisparities); - idx = k.set(idx, state->preFilterCap); - idx = k.set(idx, state->SADWindowSize); - idx = k.set(idx, state->textureThreshold); - idx = k.set(idx, state->uniquenessRatio); - - return k.run(2, globalThreads, NULL, false); -} - -static bool ocl_stereo(InputArray _left, InputArray _right, - OutputArray _disp, StereoBMParams* state) -{ - //if(ocl::Device::getDefault().localMemSize() > state->numDisparities * state->numDisparities * sizeof(short) ) - return ocl_stereobm_opt(_left, _right, _disp, state); - //else - // return ocl_stereobm_bf(_left, _right, _disp, state); -} - struct FindStereoCorrespInvoker : public ParallelLoopBody { FindStereoCorrespInvoker( const Mat& _left, const Mat& _right, @@ -950,18 +914,20 @@ public: int FILTERED = (params.minDisparity - 1) << DISPARITY_SHIFT; - if(ocl::useOpenCL() && disparr.isUMat()) + if(ocl::useOpenCL() && disparr.isUMat() && params.textureThreshold == 0) { UMat left, right; - CV_Assert(ocl_prefiltering(leftarr, rightarr, left, right, ¶ms)); - CV_Assert(ocl_stereo(left, right, disparr, ¶ms)); - - if( params.speckleRange >= 0 && params.speckleWindowSize > 0 ) - filterSpeckles(disparr.getMat(), FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf); - - if (dtype == CV_32F) - disparr.getUMat().convertTo(disparr, CV_32FC1, 1./(1 << DISPARITY_SHIFT), 0); - return; + if(ocl_prefiltering(leftarr, rightarr, left, right, ¶ms)) + { + if(ocl_stereobm(left, right, disparr, ¶ms)) + { + if( params.speckleRange >= 0 && params.speckleWindowSize > 0 ) + filterSpeckles(disparr.getMat(), FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf); + if (dtype == CV_32F) + disparr.getUMat().convertTo(disparr, CV_32FC1, 1./(1 << DISPARITY_SHIFT), 0); + return; + } + } } Mat left0 = leftarr.getMat(), right0 = rightarr.getMat(); diff --git a/modules/calib3d/test/opencl/test_stereobm.cpp b/modules/calib3d/test/opencl/test_stereobm.cpp index 16b3a2a8d..636d76cb2 100644 --- a/modules/calib3d/test/opencl/test_stereobm.cpp +++ b/modules/calib3d/test/opencl/test_stereobm.cpp @@ -81,26 +81,11 @@ OCL_TEST_P(StereoBMFixture, StereoBM) { Ptr bm = createStereoBM( n_disp, winSize); bm->setPreFilterType(bm->PREFILTER_XSOBEL); -// bm->setMinDisparity(15); + bm->setTextureThreshold(0); - long t1 = clock(); OCL_OFF(bm->compute(left, right, disp)); - long t2 = clock(); OCL_ON(bm->compute(uleft, uright, udisp)); - cv::ocl::finish(); - long t3 = clock(); - std::cout << (double)(t2-t1)/CLOCKS_PER_SEC << " " << (double)(t3-t2)/CLOCKS_PER_SEC << std::endl; - /* - Mat t; absdiff(disp, udisp, t); - for(int i = 0; i(i,j) > 0) - if(i == 5 && j == 36) - printf("%d %d cv: %d ocl: %d\n", i, j, disp.at(i,j), udisp.getMat(ACCESS_READ).at(i,j) ); -/* imshow("diff.png", t*100); - imshow("cv.png", disp*100); - imshow("ocl.png", udisp.getMat(ACCESS_READ)*100); - waitKey(0);*/ + Near(1e-3); }