From 551b5d3e1af5891959d797f947c850dbe9055ccd Mon Sep 17 00:00:00 2001 From: Kai Hugo Hustoft Endresen Date: Fri, 8 Jan 2016 00:32:52 +0100 Subject: [PATCH] StereoSGBM.cpp - use SSE2 for pass 2 using MODE_HH With a test image set of 2800x1400 bytes on an Intel Core i7 5960X, this improves the runtime of MODE_HH by about 10% (the replaced code segment itself is approximately 3 times faster than the non-SSE2 variant). I was able to reduce the runtime by 130 ms with this simple fix. The second part of the SSE2-optimized code could probably be optimized further by using SSE2 shift operations, but I imagine this would improve performance by 10-20 ms at best. --- modules/calib3d/src/stereosgbm.cpp | 50 +++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp index 1c085f9b3..9b783e9d6 100644 --- a/modules/calib3d/src/stereosgbm.cpp +++ b/modules/calib3d/src/stereosgbm.cpp @@ -759,14 +759,50 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, } else { - for( d = 0; d < D; d++ ) + #if CV_SSE2 + if( useSIMD ) { - int Sval = Sp[d]; - if( Sval < minS ) - { - minS = Sval; - bestDisp = d; - } + __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1); + __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8); + + for( d = 0; d < D; d+= 8 ) + { + __m128i L0 = _mm_load_si128((const __m128i*)( Sp + d )); + __m128i mask = _mm_cmplt_epi16( L0, _minS ); + _minS = _mm_min_epi16( L0, _minS ); + _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128( _bestDisp, _d8), mask)); + _d8 = _mm_adds_epi16(_d8, _8 ); + } + short CV_DECL_ALIGNED(16) bestDispBuf[8]; + _mm_store_si128((__m128i*)bestDispBuf, _bestDisp); + short CV_DECL_ALIGNED(16) minSBuf[8]; + _mm_store_si128((__m128i*)minSBuf, _minS ); + + for( int i = 0; i < 8; i++ ) + { + int Sval = minSBuf[ i ]; + if( Sval <= minS ) + { + if( ( Sval < minS ) || ( bestDispBuf[i] < 
bestDisp ) ) + { + bestDisp = bestDispBuf[i]; + } + minS = Sval; + } + } + } + else + #endif + { + for( d = 0; d < D; d++ ) + { + int Sval = Sp[d]; + if( Sval < minS ) + { + minS = Sval; + bestDisp = d; + } + } } }