|
|
|
@@ -52,6 +52,7 @@
|
|
|
|
|
|
|
|
|
|
#include "precomp.hpp"
|
|
|
|
|
#include <limits.h>
|
|
|
|
|
#include "opencv2/hal/intrin.hpp"
|
|
|
|
|
|
|
|
|
|
namespace cv
|
|
|
|
|
{
|
|
|
|
@@ -110,7 +111,7 @@ struct StereoSGBMParams
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
For each pixel row1[x], max(-maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
|
|
|
|
|
For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
|
|
|
|
|
and for each disparity minD<=d<maxD the function
|
|
|
|
|
computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
|
|
|
|
|
row1[x] and row2[x-d]. The subpixel algorithm from
|
|
|
|
@@ -125,7 +126,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
|
|
|
|
|
int tabOfs, int )
|
|
|
|
|
{
|
|
|
|
|
int x, c, width = img1.cols, cn = img1.channels();
|
|
|
|
|
int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);
|
|
|
|
|
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
|
|
|
|
|
int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
|
|
|
|
|
int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
|
|
|
|
|
const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
|
|
|
|
@@ -180,10 +181,6 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
|
|
|
|
|
buffer -= minX2;
|
|
|
|
|
cost -= minX1*D + minD; // simplify the cost indices inside the loop
|
|
|
|
|
|
|
|
|
|
#if CV_SSE2
|
|
|
|
|
volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if 1
|
|
|
|
|
for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
|
|
|
|
|
{
|
|
|
|
@@ -211,32 +208,28 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
|
|
|
|
|
int u0 = std::min(ul, ur); u0 = std::min(u0, u);
|
|
|
|
|
int u1 = std::max(ul, ur); u1 = std::max(u1, u);
|
|
|
|
|
|
|
|
|
|
#if CV_SSE2
|
|
|
|
|
if( useSIMD )
|
|
|
|
|
{
|
|
|
|
|
__m128i _u = _mm_set1_epi8((char)u), _u0 = _mm_set1_epi8((char)u0);
|
|
|
|
|
__m128i _u1 = _mm_set1_epi8((char)u1), z = _mm_setzero_si128();
|
|
|
|
|
__m128i ds = _mm_cvtsi32_si128(diff_scale);
|
|
|
|
|
#if CV_SIMD128
|
|
|
|
|
v_uint8x16 _u = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0);
|
|
|
|
|
v_uint8x16 _u1 = v_setall_u8((uchar)u1);
|
|
|
|
|
|
|
|
|
|
for( int d = minD; d < maxD; d += 16 )
|
|
|
|
|
{
|
|
|
|
|
__m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-x-1 + d));
|
|
|
|
|
__m128i _v0 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d));
|
|
|
|
|
__m128i _v1 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d + width2));
|
|
|
|
|
__m128i c0 = _mm_max_epu8(_mm_subs_epu8(_u, _v1), _mm_subs_epu8(_v0, _u));
|
|
|
|
|
__m128i c1 = _mm_max_epu8(_mm_subs_epu8(_v, _u1), _mm_subs_epu8(_u0, _v));
|
|
|
|
|
__m128i diff = _mm_min_epu8(c0, c1);
|
|
|
|
|
v_uint8x16 _v = v_load(prow2 + width-x-1 + d);
|
|
|
|
|
v_uint8x16 _v0 = v_load(buffer + width-x-1 + d);
|
|
|
|
|
v_uint8x16 _v1 = v_load(buffer + width-x-1 + d + width2);
|
|
|
|
|
v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u);
|
|
|
|
|
v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v);
|
|
|
|
|
v_uint8x16 diff = v_min(c0, c1);
|
|
|
|
|
|
|
|
|
|
c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
|
|
|
|
|
c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
|
|
|
|
|
v_int16x8 _c0 = v_load_aligned(cost + x*D + d);
|
|
|
|
|
v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8);
|
|
|
|
|
|
|
|
|
|
_mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_srl_epi16(_mm_unpacklo_epi8(diff,z), ds)));
|
|
|
|
|
_mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_srl_epi16(_mm_unpackhi_epi8(diff,z), ds)));
|
|
|
|
|
v_uint16x8 diff1,diff2;
|
|
|
|
|
v_expand(diff,diff1,diff2);
|
|
|
|
|
v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
|
|
|
|
|
v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
#endif
|
|
|
|
|
{
|
|
|
|
|
#else
|
|
|
|
|
for( int d = minD; d < maxD; d++ )
|
|
|
|
|
{
|
|
|
|
|
int v = prow2[width-x-1 + d];
|
|
|
|
@@ -247,7 +240,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
|
|
|
|
|
|
|
|
|
|
cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#else
|
|
|
|
@@ -340,7 +333,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
|
|
|
|
int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
|
|
|
|
|
int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
|
|
|
|
|
int k, width = disp1.cols, height = disp1.rows;
|
|
|
|
|
int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);
|
|
|
|
|
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
|
|
|
|
|
int D = maxD - minD, width1 = maxX1 - minX1;
|
|
|
|
|
int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
|
|
|
|
|
int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
|
|
|
|
@@ -829,6 +822,645 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
|
|
void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
|
|
|
|
|
CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
|
|
|
|
|
PixType*& tmpBuf, CostType*& horPassCostVolume,
|
|
|
|
|
CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
|
|
|
|
|
CostType*& disp2CostBuf, short*& disp2Buf);
|
|
|
|
|
|
|
|
|
|
struct SGBM3WayMainLoop : public ParallelLoopBody
|
|
|
|
|
{
|
|
|
|
|
Mat* buffers;
|
|
|
|
|
const Mat *img1, *img2;
|
|
|
|
|
Mat* dst_disp;
|
|
|
|
|
|
|
|
|
|
int nstripes, stripe_sz;
|
|
|
|
|
int stripe_overlap;
|
|
|
|
|
|
|
|
|
|
int width,height;
|
|
|
|
|
int minD, maxD, D;
|
|
|
|
|
int minX1, maxX1, width1;
|
|
|
|
|
|
|
|
|
|
int SW2, SH2;
|
|
|
|
|
int P1, P2;
|
|
|
|
|
int uniquenessRatio, disp12MaxDiff;
|
|
|
|
|
|
|
|
|
|
int costBufSize, hsumBufNRows;
|
|
|
|
|
int TAB_OFS, ftzero;
|
|
|
|
|
|
|
|
|
|
PixType* clipTab;
|
|
|
|
|
|
|
|
|
|
SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap);
|
|
|
|
|
void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const;
|
|
|
|
|
void operator () (const Range& range) const;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap):
|
|
|
|
|
buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab)
|
|
|
|
|
{
|
|
|
|
|
nstripes = _nstripes;
|
|
|
|
|
stripe_overlap = _stripe_overlap;
|
|
|
|
|
stripe_sz = (int)ceil(img1->rows/(double)nstripes);
|
|
|
|
|
|
|
|
|
|
width = img1->cols; height = img1->rows;
|
|
|
|
|
minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
|
|
|
|
|
minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1;
|
|
|
|
|
CV_Assert( D % 16 == 0 );
|
|
|
|
|
|
|
|
|
|
SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;
|
|
|
|
|
|
|
|
|
|
P1 = params.P1 > 0 ? params.P1 : 2; P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
|
|
|
|
|
uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
|
|
|
|
|
disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
|
|
|
|
|
|
|
|
|
|
costBufSize = width1*D;
|
|
|
|
|
hsumBufNRows = SH2*2 + 2;
|
|
|
|
|
TAB_OFS = 256*4;
|
|
|
|
|
ftzero = std::max(params.preFilterCap, 15) | 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
|
|
|
|
|
CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
|
|
|
|
|
PixType*& tmpBuf, CostType*& horPassCostVolume,
|
|
|
|
|
CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
|
|
|
|
|
CostType*& disp2CostBuf, short*& disp2Buf)
|
|
|
|
|
{
|
|
|
|
|
// allocating all the required memory:
|
|
|
|
|
int costVolumeLineSize = width1*D;
|
|
|
|
|
int width1_ext = width1+2;
|
|
|
|
|
int costVolumeLineSize_ext = width1_ext*D;
|
|
|
|
|
int hsumBufNRows = SH2*2 + 2;
|
|
|
|
|
|
|
|
|
|
// main buffer to store matching costs for the current line:
|
|
|
|
|
int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType);
|
|
|
|
|
|
|
|
|
|
// auxiliary buffers for the raw matching cost computation:
|
|
|
|
|
int hsumBufSize = costVolumeLineSize*hsumBufNRows*sizeof(CostType);
|
|
|
|
|
int pixDiffSize = costVolumeLineSize*sizeof(CostType);
|
|
|
|
|
int tmpBufSize = width*16*num_ch*sizeof(PixType);
|
|
|
|
|
|
|
|
|
|
// auxiliary buffers for the matching cost aggregation:
|
|
|
|
|
int horPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation
|
|
|
|
|
int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation
|
|
|
|
|
int vertPassMinSize = width1_ext*sizeof(CostType); // buffer for storing minimum costs from the previous line
|
|
|
|
|
int rightPassBufSize = D*sizeof(CostType); // additional small buffer for the right-to-left pass
|
|
|
|
|
|
|
|
|
|
// buffers for the pseudo-LRC check:
|
|
|
|
|
int disp2CostBufSize = width*sizeof(CostType);
|
|
|
|
|
int disp2BufSize = width*sizeof(short);
|
|
|
|
|
|
|
|
|
|
// sum up the sizes of all the buffers:
|
|
|
|
|
size_t totalBufSize = curCostVolumeLineSize +
|
|
|
|
|
hsumBufSize +
|
|
|
|
|
pixDiffSize +
|
|
|
|
|
tmpBufSize +
|
|
|
|
|
horPassCostVolumeSize +
|
|
|
|
|
vertPassCostVolumeSize +
|
|
|
|
|
vertPassMinSize +
|
|
|
|
|
rightPassBufSize +
|
|
|
|
|
disp2CostBufSize +
|
|
|
|
|
disp2BufSize +
|
|
|
|
|
16; //to compensate for the alignPtr shifts
|
|
|
|
|
|
|
|
|
|
if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
|
|
|
|
|
buffer.create(1, (int)totalBufSize, CV_8U);
|
|
|
|
|
|
|
|
|
|
// set up all the pointers:
|
|
|
|
|
curCostVolumeLine = (CostType*)alignPtr(buffer.ptr(), 16);
|
|
|
|
|
hsumBuf = curCostVolumeLine + costVolumeLineSize;
|
|
|
|
|
pixDiff = hsumBuf + costVolumeLineSize*hsumBufNRows;
|
|
|
|
|
tmpBuf = (PixType*)(pixDiff + costVolumeLineSize);
|
|
|
|
|
horPassCostVolume = (CostType*)(tmpBuf + width*16*num_ch);
|
|
|
|
|
vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext;
|
|
|
|
|
rightPassBuf = vertPassCostVolume + costVolumeLineSize_ext;
|
|
|
|
|
vertPassMin = rightPassBuf + D;
|
|
|
|
|
disp2CostBuf = vertPassMin + width1_ext;
|
|
|
|
|
disp2Buf = disp2CostBuf + width;
|
|
|
|
|
|
|
|
|
|
// initialize memory:
|
|
|
|
|
memset(buffer.ptr(),0,totalBufSize);
|
|
|
|
|
for(int i=0;i<costVolumeLineSize;i++)
|
|
|
|
|
curCostVolumeLine[i] = (CostType)P2; //such initialization simplifies the cost aggregation loops a bit
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// performing block matching and building raw cost-volume for the current row
|
|
|
|
|
void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
|
|
|
|
|
CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, //buffers
|
|
|
|
|
int y, int src_start_idx) const
|
|
|
|
|
{
|
|
|
|
|
int x, d;
|
|
|
|
|
int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1;
|
|
|
|
|
|
|
|
|
|
for(int k = dy1; k <= dy2; k++ )
|
|
|
|
|
{
|
|
|
|
|
CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
|
|
|
|
|
if( k < height )
|
|
|
|
|
{
|
|
|
|
|
calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab, TAB_OFS, ftzero );
|
|
|
|
|
|
|
|
|
|
memset(hsumAdd, 0, D*sizeof(CostType));
|
|
|
|
|
for(x = 0; x <= SW2*D; x += D )
|
|
|
|
|
{
|
|
|
|
|
int scale = x == 0 ? SW2 + 1 : 1;
|
|
|
|
|
|
|
|
|
|
for( d = 0; d < D; d++ )
|
|
|
|
|
hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if( y > src_start_idx )
|
|
|
|
|
{
|
|
|
|
|
const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;
|
|
|
|
|
|
|
|
|
|
for( x = D; x < width1*D; x += D )
|
|
|
|
|
{
|
|
|
|
|
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
|
|
|
|
|
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
|
|
|
|
|
|
|
|
|
|
#if CV_SIMD128
|
|
|
|
|
v_int16x8 hv_reg;
|
|
|
|
|
for( d = 0; d < D; d+=8 )
|
|
|
|
|
{
|
|
|
|
|
hv_reg = v_load_aligned(hsumAdd+x-D+d) + (v_load_aligned(pixAdd+d) - v_load_aligned(pixSub+d));
|
|
|
|
|
v_store_aligned(hsumAdd+x+d,hv_reg);
|
|
|
|
|
v_store_aligned(C+x+d,v_load_aligned(C+x+d)+(hv_reg-v_load_aligned(hsumSub+x+d)));
|
|
|
|
|
}
|
|
|
|
|
#else
|
|
|
|
|
for( d = 0; d < D; d++ )
|
|
|
|
|
{
|
|
|
|
|
int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
|
|
|
|
|
C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]);
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
for( x = D; x < width1*D; x += D )
|
|
|
|
|
{
|
|
|
|
|
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
|
|
|
|
|
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
|
|
|
|
|
|
|
|
|
|
for( d = 0; d < D; d++ )
|
|
|
|
|
hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if( y == src_start_idx )
|
|
|
|
|
{
|
|
|
|
|
int scale = k == src_start_idx ? SH2 + 1 : 1;
|
|
|
|
|
for( x = 0; x < width1*D; x++ )
|
|
|
|
|
C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#if CV_SIMD128 && CV_SSE2
|
|
|
|
|
// define some additional reduce operations:
|
|
|
|
|
inline short min(const v_int16x8& a)
|
|
|
|
|
{
|
|
|
|
|
short CV_DECL_ALIGNED(16) buf[8];
|
|
|
|
|
v_store_aligned(buf, a);
|
|
|
|
|
short s0 = std::min(buf[0], buf[1]);
|
|
|
|
|
short s1 = std::min(buf[2], buf[3]);
|
|
|
|
|
short s2 = std::min(buf[4], buf[5]);
|
|
|
|
|
short s3 = std::min(buf[6], buf[7]);
|
|
|
|
|
return std::min(std::min(s0, s1),std::min(s2, s3));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inline short min_pos(const v_int16x8& val,const v_int16x8& pos)
|
|
|
|
|
{
|
|
|
|
|
short CV_DECL_ALIGNED(16) val_buf[8];
|
|
|
|
|
v_store_aligned(val_buf, val);
|
|
|
|
|
short CV_DECL_ALIGNED(16) pos_buf[8];
|
|
|
|
|
v_store_aligned(pos_buf, pos);
|
|
|
|
|
short res_pos = 0;
|
|
|
|
|
short min_val = SHRT_MAX;
|
|
|
|
|
if(val_buf[0]<min_val) {min_val=val_buf[0]; res_pos=pos_buf[0];}
|
|
|
|
|
if(val_buf[1]<min_val) {min_val=val_buf[1]; res_pos=pos_buf[1];}
|
|
|
|
|
if(val_buf[2]<min_val) {min_val=val_buf[2]; res_pos=pos_buf[2];}
|
|
|
|
|
if(val_buf[3]<min_val) {min_val=val_buf[3]; res_pos=pos_buf[3];}
|
|
|
|
|
if(val_buf[4]<min_val) {min_val=val_buf[4]; res_pos=pos_buf[4];}
|
|
|
|
|
if(val_buf[5]<min_val) {min_val=val_buf[5]; res_pos=pos_buf[5];}
|
|
|
|
|
if(val_buf[6]<min_val) {min_val=val_buf[6]; res_pos=pos_buf[6];}
|
|
|
|
|
if(val_buf[7]<min_val) {min_val=val_buf[7]; res_pos=pos_buf[7];}
|
|
|
|
|
return res_pos;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// performing SGM cost accumulation from left to right (result is stored in leftBuf) and
|
|
|
|
|
// in-place cost accumulation from top to bottom (result is stored in topBuf)
|
|
|
|
|
inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs,
|
|
|
|
|
CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2)
|
|
|
|
|
{
|
|
|
|
|
#if CV_SIMD128 && CV_SSE2
|
|
|
|
|
v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));
|
|
|
|
|
|
|
|
|
|
v_int16x8 leftMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
|
|
|
|
|
v_int16x8 leftMinCost_new_reg = v_setall_s16(SHRT_MAX);
|
|
|
|
|
v_int16x8 src0_leftBuf = v_setall_s16(SHRT_MAX);
|
|
|
|
|
v_int16x8 src1_leftBuf = v_load_aligned(leftBuf_prev);
|
|
|
|
|
|
|
|
|
|
v_int16x8 topMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(topMinCost+P2));
|
|
|
|
|
v_int16x8 topMinCost_new_reg = v_setall_s16(SHRT_MAX);
|
|
|
|
|
v_int16x8 src0_topBuf = v_setall_s16(SHRT_MAX);
|
|
|
|
|
v_int16x8 src1_topBuf = v_load_aligned(topBuf);
|
|
|
|
|
|
|
|
|
|
v_int16x8 src2;
|
|
|
|
|
v_int16x8 src_shifted_left,src_shifted_right;
|
|
|
|
|
v_int16x8 res;
|
|
|
|
|
|
|
|
|
|
for(int i=0;i<D-8;i+=8)
|
|
|
|
|
{
|
|
|
|
|
//process leftBuf:
|
|
|
|
|
//lookahead load:
|
|
|
|
|
src2 = v_load_aligned(leftBuf_prev+i+8);
|
|
|
|
|
|
|
|
|
|
//get shifted versions of the current block:
|
|
|
|
|
src_shifted_left = v_int16x8(_mm_slli_si128(src1_leftBuf.val, 2));
|
|
|
|
|
src_shifted_right = v_int16x8(_mm_srli_si128(src1_leftBuf.val, 2));
|
|
|
|
|
|
|
|
|
|
// replace shifted-in zeros with proper values and add P1:
|
|
|
|
|
src_shifted_left = (src_shifted_left | v_int16x8(_mm_srli_si128(src0_leftBuf.val, 14)))+P1_reg;
|
|
|
|
|
src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val, 14)))+P1_reg;
|
|
|
|
|
|
|
|
|
|
// process and save current block:
|
|
|
|
|
res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
|
|
|
|
|
leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);
|
|
|
|
|
v_store_aligned(leftBuf+i, res);
|
|
|
|
|
|
|
|
|
|
//update src buffers:
|
|
|
|
|
src0_leftBuf = src1_leftBuf;
|
|
|
|
|
src1_leftBuf = src2;
|
|
|
|
|
|
|
|
|
|
//process topBuf:
|
|
|
|
|
//lookahead load:
|
|
|
|
|
src2 = v_load_aligned(topBuf+i+8);
|
|
|
|
|
|
|
|
|
|
//get shifted versions of the current block:
|
|
|
|
|
src_shifted_left = v_int16x8(_mm_slli_si128(src1_topBuf.val, 2));
|
|
|
|
|
src_shifted_right = v_int16x8(_mm_srli_si128(src1_topBuf.val, 2));
|
|
|
|
|
|
|
|
|
|
// replace shifted-in zeros with proper values and add P1:
|
|
|
|
|
src_shifted_left = (src_shifted_left | v_int16x8(_mm_srli_si128(src0_topBuf.val, 14)))+P1_reg;
|
|
|
|
|
src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val , 14)))+P1_reg;
|
|
|
|
|
|
|
|
|
|
// process and save current block:
|
|
|
|
|
res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
|
|
|
|
|
topMinCost_new_reg = v_min(topMinCost_new_reg,res);
|
|
|
|
|
v_store_aligned(topBuf+i, res);
|
|
|
|
|
|
|
|
|
|
//update src buffers:
|
|
|
|
|
src0_topBuf = src1_topBuf;
|
|
|
|
|
src1_topBuf = src2;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// a bit different processing for the last cycle of the loop:
|
|
|
|
|
//process leftBuf:
|
|
|
|
|
src_shifted_left = v_int16x8(_mm_slli_si128(src1_leftBuf.val, 2));
|
|
|
|
|
src_shifted_right = v_int16x8(_mm_srli_si128(src1_leftBuf.val, 2));
|
|
|
|
|
|
|
|
|
|
src2 = v_setall_s16(SHRT_MAX);
|
|
|
|
|
|
|
|
|
|
src_shifted_left = (src_shifted_left | v_int16x8(_mm_srli_si128(src0_leftBuf.val, 14)))+P1_reg;
|
|
|
|
|
src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val , 14)))+P1_reg;
|
|
|
|
|
|
|
|
|
|
res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
|
|
|
|
|
leftMinCost = min(v_min(leftMinCost_new_reg,res));
|
|
|
|
|
v_store_aligned(leftBuf+D-8, res);
|
|
|
|
|
|
|
|
|
|
//process topBuf:
|
|
|
|
|
src_shifted_left = v_int16x8(_mm_slli_si128(src1_topBuf.val, 2));
|
|
|
|
|
src_shifted_right = v_int16x8(_mm_srli_si128(src1_topBuf.val, 2));
|
|
|
|
|
|
|
|
|
|
src2 = v_setall_s16(SHRT_MAX);
|
|
|
|
|
|
|
|
|
|
src_shifted_left = (src_shifted_left | v_int16x8(_mm_srli_si128(src0_topBuf.val, 14)))+P1_reg;
|
|
|
|
|
src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val , 14)))+P1_reg;
|
|
|
|
|
|
|
|
|
|
res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
|
|
|
|
|
topMinCost = min(v_min(topMinCost_new_reg,res));
|
|
|
|
|
v_store_aligned(topBuf+D-8, res);
|
|
|
|
|
#else
|
|
|
|
|
CostType leftMinCost_new = SHRT_MAX;
|
|
|
|
|
CostType topMinCost_new = SHRT_MAX;
|
|
|
|
|
int leftMinCost_P2 = leftMinCost + P2;
|
|
|
|
|
int topMinCost_P2 = topMinCost + P2;
|
|
|
|
|
CostType leftBuf_prev_i_minus_1 = SHRT_MAX;
|
|
|
|
|
CostType topBuf_i_minus_1 = SHRT_MAX;
|
|
|
|
|
CostType tmp;
|
|
|
|
|
|
|
|
|
|
for(int i=0;i<D-1;i++)
|
|
|
|
|
{
|
|
|
|
|
leftBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1+P1,leftBuf_prev[i+1]+P1),std::min((int)leftBuf_prev[i],leftMinCost_P2))-leftMinCost_P2);
|
|
|
|
|
leftBuf_prev_i_minus_1 = leftBuf_prev[i];
|
|
|
|
|
leftMinCost_new = std::min(leftMinCost_new,leftBuf[i]);
|
|
|
|
|
|
|
|
|
|
tmp = topBuf[i];
|
|
|
|
|
topBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(topBuf_i_minus_1+P1,topBuf[i+1]+P1),std::min((int)topBuf[i],topMinCost_P2))-topMinCost_P2);
|
|
|
|
|
topBuf_i_minus_1 = tmp;
|
|
|
|
|
topMinCost_new = std::min(topMinCost_new,topBuf[i]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
leftBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(leftBuf_prev_i_minus_1+P1,std::min((int)leftBuf_prev[D-1],leftMinCost_P2))-leftMinCost_P2);
|
|
|
|
|
leftMinCost = std::min(leftMinCost_new,leftBuf[D-1]);
|
|
|
|
|
|
|
|
|
|
topBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(topBuf_i_minus_1+P1,std::min((int)topBuf[D-1],topMinCost_P2))-topMinCost_P2);
|
|
|
|
|
topMinCost = std::min(topMinCost_new,topBuf[D-1]);
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and
|
|
|
|
|
// summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the
|
|
|
|
|
// optimal disparity value with minimum accumulated cost
|
|
|
|
|
inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs,
|
|
|
|
|
CostType& rightMinCost, int D, int P1, int P2, int& optimal_disp, CostType& min_cost)
|
|
|
|
|
{
|
|
|
|
|
#if CV_SIMD128 && CV_SSE2
|
|
|
|
|
v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));
|
|
|
|
|
|
|
|
|
|
v_int16x8 rightMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
|
|
|
|
|
v_int16x8 rightMinCost_new_reg = v_setall_s16(SHRT_MAX);
|
|
|
|
|
v_int16x8 src0_rightBuf = v_setall_s16(SHRT_MAX);
|
|
|
|
|
v_int16x8 src1_rightBuf = v_load(rightBuf);
|
|
|
|
|
|
|
|
|
|
v_int16x8 src2;
|
|
|
|
|
v_int16x8 src_shifted_left,src_shifted_right;
|
|
|
|
|
v_int16x8 res;
|
|
|
|
|
|
|
|
|
|
v_int16x8 min_sum_cost_reg = v_setall_s16(SHRT_MAX);
|
|
|
|
|
v_int16x8 min_sum_pos_reg = v_setall_s16(0);
|
|
|
|
|
v_int16x8 loop_idx(0,1,2,3,4,5,6,7);
|
|
|
|
|
v_int16x8 eight_reg = v_setall_s16(8);
|
|
|
|
|
|
|
|
|
|
for(int i=0;i<D-8;i+=8)
|
|
|
|
|
{
|
|
|
|
|
//lookahead load:
|
|
|
|
|
src2 = v_load_aligned(rightBuf+i+8);
|
|
|
|
|
|
|
|
|
|
//get shifted versions of the current block:
|
|
|
|
|
src_shifted_left = v_int16x8(_mm_slli_si128(src1_rightBuf.val, 2));
|
|
|
|
|
src_shifted_right = v_int16x8(_mm_srli_si128(src1_rightBuf.val, 2));
|
|
|
|
|
|
|
|
|
|
// replace shifted-in zeros with proper values and add P1:
|
|
|
|
|
src_shifted_left = (src_shifted_left | v_int16x8(_mm_srli_si128(src0_rightBuf.val, 14)))+P1_reg;
|
|
|
|
|
src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val , 14)))+P1_reg;
|
|
|
|
|
|
|
|
|
|
// process and save current block:
|
|
|
|
|
res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
|
|
|
|
|
rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);
|
|
|
|
|
v_store_aligned(rightBuf+i, res);
|
|
|
|
|
|
|
|
|
|
// compute and save total cost:
|
|
|
|
|
res = res + v_load_aligned(leftBuf+i) + v_load_aligned(topBuf+i);
|
|
|
|
|
v_store_aligned(leftBuf+i, res);
|
|
|
|
|
|
|
|
|
|
// track disparity value with the minimum cost:
|
|
|
|
|
min_sum_cost_reg = v_min(min_sum_cost_reg,res);
|
|
|
|
|
min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
|
|
|
|
|
loop_idx = loop_idx+eight_reg;
|
|
|
|
|
|
|
|
|
|
//update src:
|
|
|
|
|
src0_rightBuf = src1_rightBuf;
|
|
|
|
|
src1_rightBuf = src2;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// a bit different processing for the last cycle of the loop:
|
|
|
|
|
src_shifted_left = v_int16x8(_mm_slli_si128(src1_rightBuf.val, 2));
|
|
|
|
|
src_shifted_right = v_int16x8(_mm_srli_si128(src1_rightBuf.val, 2));
|
|
|
|
|
|
|
|
|
|
src2 = v_setall_s16(SHRT_MAX);
|
|
|
|
|
|
|
|
|
|
src_shifted_left = (src_shifted_left | v_int16x8(_mm_srli_si128(src0_rightBuf.val, 14)))+P1_reg;
|
|
|
|
|
src_shifted_right = (src_shifted_right | v_int16x8(_mm_slli_si128(src2.val , 14)))+P1_reg;
|
|
|
|
|
|
|
|
|
|
res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
|
|
|
|
|
rightMinCost = min(v_min(rightMinCost_new_reg,res));
|
|
|
|
|
v_store_aligned(rightBuf+D-8, res);
|
|
|
|
|
|
|
|
|
|
res = res + v_load_aligned(leftBuf+D-8) + v_load_aligned(topBuf+D-8);
|
|
|
|
|
v_store_aligned(leftBuf+D-8, res);
|
|
|
|
|
|
|
|
|
|
min_sum_cost_reg = v_min(min_sum_cost_reg,res);
|
|
|
|
|
min_cost = min(min_sum_cost_reg);
|
|
|
|
|
min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
|
|
|
|
|
optimal_disp = min_pos(min_sum_cost_reg,min_sum_pos_reg);
|
|
|
|
|
#else
|
|
|
|
|
CostType rightMinCost_new = SHRT_MAX;
|
|
|
|
|
int rightMinCost_P2 = rightMinCost + P2;
|
|
|
|
|
CostType rightBuf_i_minus_1 = SHRT_MAX;
|
|
|
|
|
CostType tmp;
|
|
|
|
|
min_cost = SHRT_MAX;
|
|
|
|
|
|
|
|
|
|
for(int i=0;i<D-1;i++)
|
|
|
|
|
{
|
|
|
|
|
tmp = rightBuf[i];
|
|
|
|
|
rightBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(rightBuf_i_minus_1+P1,rightBuf[i+1]+P1),std::min((int)rightBuf[i],rightMinCost_P2))-rightMinCost_P2);
|
|
|
|
|
rightBuf_i_minus_1 = tmp;
|
|
|
|
|
rightMinCost_new = std::min(rightMinCost_new,rightBuf[i]);
|
|
|
|
|
leftBuf[i] = cv::saturate_cast<CostType>((int)leftBuf[i]+rightBuf[i]+topBuf[i]);
|
|
|
|
|
if(leftBuf[i]<min_cost)
|
|
|
|
|
{
|
|
|
|
|
optimal_disp = i;
|
|
|
|
|
min_cost = leftBuf[i];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
rightBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(rightBuf_i_minus_1+P1,std::min((int)rightBuf[D-1],rightMinCost_P2))-rightMinCost_P2);
|
|
|
|
|
rightMinCost = std::min(rightMinCost_new,rightBuf[D-1]);
|
|
|
|
|
leftBuf[D-1] = cv::saturate_cast<CostType>((int)leftBuf[D-1]+rightBuf[D-1]+topBuf[D-1]);
|
|
|
|
|
if(leftBuf[D-1]<min_cost)
|
|
|
|
|
{
|
|
|
|
|
optimal_disp = D-1;
|
|
|
|
|
min_cost = leftBuf[D-1];
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void SGBM3WayMainLoop::operator () (const Range& range) const
|
|
|
|
|
{
|
|
|
|
|
// force separate processing of stripes:
|
|
|
|
|
if(range.end>range.start+1)
|
|
|
|
|
{
|
|
|
|
|
for(int n=range.start;n<range.end;n++)
|
|
|
|
|
(*this)(Range(n,n+1));
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const int DISP_SCALE = (1 << StereoMatcher::DISP_SHIFT);
|
|
|
|
|
int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
|
|
|
|
|
|
|
|
|
|
// setting up the ranges:
|
|
|
|
|
int src_start_idx = std::max(std::min(range.start * stripe_sz - stripe_overlap, height),0);
|
|
|
|
|
int src_end_idx = std::min(range.end * stripe_sz, height);
|
|
|
|
|
|
|
|
|
|
int dst_offset;
|
|
|
|
|
if(range.start==0)
|
|
|
|
|
dst_offset=stripe_overlap;
|
|
|
|
|
else
|
|
|
|
|
dst_offset=0;
|
|
|
|
|
|
|
|
|
|
Mat cur_buffer = buffers [range.start];
|
|
|
|
|
Mat cur_disp = dst_disp[range.start];
|
|
|
|
|
cur_disp = Scalar(INVALID_DISP_SCALED);
|
|
|
|
|
|
|
|
|
|
// prepare buffers:
|
|
|
|
|
CostType *curCostVolumeLine, *hsumBuf, *pixDiff;
|
|
|
|
|
PixType* tmpBuf;
|
|
|
|
|
CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf;
|
|
|
|
|
short* disp2Buf;
|
|
|
|
|
getBufferPointers(cur_buffer,width,width1,D,img1->channels(),SH2,P2,
|
|
|
|
|
curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume,
|
|
|
|
|
vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf);
|
|
|
|
|
|
|
|
|
|
// start real processing:
|
|
|
|
|
for(int y=src_start_idx;y<src_end_idx;y++)
|
|
|
|
|
{
|
|
|
|
|
getRawMatchingCost(curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,y,src_start_idx);
|
|
|
|
|
|
|
|
|
|
short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx));
|
|
|
|
|
|
|
|
|
|
// initialize the auxiliary buffers for the pseudo left-right consistency check:
|
|
|
|
|
for(int x=0;x<width;x++)
|
|
|
|
|
{
|
|
|
|
|
disp2Buf[x] = (short)INVALID_DISP_SCALED;
|
|
|
|
|
disp2CostBuf[x] = SHRT_MAX;
|
|
|
|
|
}
|
|
|
|
|
CostType* C = curCostVolumeLine - D;
|
|
|
|
|
CostType prev_min, min_cost;
|
|
|
|
|
int d, best_d;
|
|
|
|
|
d = best_d = 0;
|
|
|
|
|
|
|
|
|
|
// forward pass
|
|
|
|
|
prev_min=0;
|
|
|
|
|
for (int x=D;x<(1+width1)*D;x+=D)
|
|
|
|
|
accumulateCostsLeftTop(horPassCostVolume+x,horPassCostVolume+x-D,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/D],D,P1,P2);
|
|
|
|
|
|
|
|
|
|
//backward pass
|
|
|
|
|
memset(rightPassBuf,0,D*sizeof(CostType));
|
|
|
|
|
prev_min=0;
|
|
|
|
|
for (int x=width1*D;x>=D;x-=D)
|
|
|
|
|
{
|
|
|
|
|
accumulateCostsRight(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost);
|
|
|
|
|
|
|
|
|
|
if(uniquenessRatio>0)
|
|
|
|
|
{
|
|
|
|
|
#if CV_SIMD128
|
|
|
|
|
horPassCostVolume+=x;
|
|
|
|
|
int thresh = (100*min_cost)/(100-uniquenessRatio);
|
|
|
|
|
v_int16x8 thresh_reg = v_setall_s16((short)(thresh+1));
|
|
|
|
|
v_int16x8 d1 = v_setall_s16((short)(best_d-1));
|
|
|
|
|
v_int16x8 d2 = v_setall_s16((short)(best_d+1));
|
|
|
|
|
v_int16x8 eight_reg = v_setall_s16(8);
|
|
|
|
|
v_int16x8 cur_d(0,1,2,3,4,5,6,7);
|
|
|
|
|
v_int16x8 mask,cost1,cost2;
|
|
|
|
|
|
|
|
|
|
for( d = 0; d < D; d+=16 )
|
|
|
|
|
{
|
|
|
|
|
cost1 = v_load_aligned(horPassCostVolume+d);
|
|
|
|
|
cost2 = v_load_aligned(horPassCostVolume+d+8);
|
|
|
|
|
|
|
|
|
|
mask = cost1 < thresh_reg;
|
|
|
|
|
mask = mask & ( (cur_d<d1) | (cur_d>d2) );
|
|
|
|
|
if( v_signmask(mask) )
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
cur_d = cur_d+eight_reg;
|
|
|
|
|
|
|
|
|
|
mask = cost2 < thresh_reg;
|
|
|
|
|
mask = mask & ( (cur_d<d1) | (cur_d>d2) );
|
|
|
|
|
if( v_signmask(mask) )
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
cur_d = cur_d+eight_reg;
|
|
|
|
|
}
|
|
|
|
|
horPassCostVolume-=x;
|
|
|
|
|
#else
|
|
|
|
|
for( d = 0; d < D; d++ )
|
|
|
|
|
{
|
|
|
|
|
if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 )
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
if( d < D )
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
d = best_d;
|
|
|
|
|
|
|
|
|
|
int _x2 = x/D - 1 + minX1 - d - minD;
|
|
|
|
|
if( _x2>=0 && _x2<width && disp2CostBuf[_x2] > min_cost )
|
|
|
|
|
{
|
|
|
|
|
disp2CostBuf[_x2] = min_cost;
|
|
|
|
|
disp2Buf[_x2] = (short)(d + minD);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if( 0 < d && d < D-1 )
|
|
|
|
|
{
|
|
|
|
|
// do subpixel quadratic interpolation:
|
|
|
|
|
// fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
|
|
|
|
|
// then find minimum of the parabola.
|
|
|
|
|
int denom2 = std::max(horPassCostVolume[x+d-1] + horPassCostVolume[x+d+1] - 2*horPassCostVolume[x+d], 1);
|
|
|
|
|
d = d*DISP_SCALE + ((horPassCostVolume[x+d-1] - horPassCostVolume[x+d+1])*DISP_SCALE + denom2)/(denom2*2);
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
d *= DISP_SCALE;
|
|
|
|
|
|
|
|
|
|
disp_row[(x/D)-1 + minX1] = (DispType)(d + minD*DISP_SCALE);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for(int x = minX1; x < maxX1; x++ )
|
|
|
|
|
{
|
|
|
|
|
// pseudo LRC consistency check using only one disparity map;
|
|
|
|
|
// pixels with difference more than disp12MaxDiff are invalidated
|
|
|
|
|
int d1 = disp_row[x];
|
|
|
|
|
if( d1 == INVALID_DISP_SCALED )
|
|
|
|
|
continue;
|
|
|
|
|
int _d = d1 >> StereoMatcher::DISP_SHIFT;
|
|
|
|
|
int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT;
|
|
|
|
|
int _x = x - _d, x_ = x - d_;
|
|
|
|
|
if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff &&
|
|
|
|
|
0 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff )
|
|
|
|
|
disp_row[x] = (short)INVALID_DISP_SCALED;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
|
|
|
|
|
Mat& disp1, const StereoSGBMParams& params,
|
|
|
|
|
Mat* buffers, int nstripes )
|
|
|
|
|
{
|
|
|
|
|
// precompute a lookup table for the raw matching cost computation:
|
|
|
|
|
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
|
|
|
|
|
PixType* clipTab = new PixType[TAB_SIZE];
|
|
|
|
|
int ftzero = std::max(params.preFilterCap, 15) | 1;
|
|
|
|
|
for(int k = 0; k < TAB_SIZE; k++ )
|
|
|
|
|
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
|
|
|
|
|
|
|
|
|
|
// allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:
|
|
|
|
|
int stripe_sz = (int)ceil(img1.rows/(double)nstripes);
|
|
|
|
|
int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz);
|
|
|
|
|
Mat* dst_disp = new Mat[nstripes];
|
|
|
|
|
for(int i=0;i<nstripes;i++)
|
|
|
|
|
dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S);
|
|
|
|
|
|
|
|
|
|
parallel_for_(Range(0,nstripes),SGBM3WayMainLoop(buffers,img1,img2,dst_disp,params,clipTab,nstripes,stripe_overlap));
|
|
|
|
|
|
|
|
|
|
//assemble disp1 from dst_disp:
|
|
|
|
|
short* dst_row;
|
|
|
|
|
short* src_row;
|
|
|
|
|
for(int i=0;i<disp1.rows;i++)
|
|
|
|
|
{
|
|
|
|
|
dst_row = (short*)disp1.ptr(i);
|
|
|
|
|
src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz);
|
|
|
|
|
memcpy(dst_row,src_row,disp1.cols*sizeof(short));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
delete[] clipTab;
|
|
|
|
|
delete[] dst_disp;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
class StereoSGBMImpl : public StereoSGBM
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
@@ -857,7 +1489,11 @@ public:
|
|
|
|
|
disparr.create( left.size(), CV_16S );
|
|
|
|
|
Mat disp = disparr.getMat();
|
|
|
|
|
|
|
|
|
|
if(params.mode==MODE_SGBM_3WAY)
|
|
|
|
|
computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
|
|
|
|
|
else
|
|
|
|
|
computeDisparitySGBM( left, right, disp, params, buffer );
|
|
|
|
|
|
|
|
|
|
medianBlur(disp, disp, 3);
|
|
|
|
|
|
|
|
|
|
if( params.speckleWindowSize > 0 )
|
|
|
|
@@ -933,6 +1569,12 @@ public:
|
|
|
|
|
|
|
|
|
|
StereoSGBMParams params;
|
|
|
|
|
Mat buffer;
|
|
|
|
|
|
|
|
|
|
// the number of stripes is fixed, disregarding the number of threads/processors
|
|
|
|
|
// to make the results fully reproducible:
|
|
|
|
|
static const int num_stripes = 4;
|
|
|
|
|
Mat buffers[num_stripes];
|
|
|
|
|
|
|
|
|
|
static const char* name_;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|