From b782d8bb53d180f20f90b16b123637463e33d879 Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky
Date: Tue, 24 Jul 2012 17:24:31 +0400
Subject: [PATCH] integrated patch with some SSE2/SSE4.2 optimizations from Grigory Frolov

---
 .../core/include/opencv2/core/internal.hpp |   21 ++++++-
 modules/core/src/convert.cpp               |   58 ++++++++++++++++-
 modules/core/src/copy.cpp                  |   62 ++++++++++++++++++-
 modules/core/src/lapack.cpp                |   51 ++++++++++++++-
 modules/core/src/precomp.hpp               |    2 +
 modules/core/src/stat.cpp                  |   32 +++++++++-
 modules/core/src/system.cpp                |    4 +-
 7 files changed, 220 insertions(+), 10 deletions(-)

diff --git a/modules/core/include/opencv2/core/internal.hpp b/modules/core/include/opencv2/core/internal.hpp
index 8a66e1135..369921aff 100644
--- a/modules/core/include/opencv2/core/internal.hpp
+++ b/modules/core/include/opencv2/core/internal.hpp
@@ -120,12 +120,27 @@ CV_INLINE IppiSize ippiSize(int width, int height)
 #  else
 #    define CV_SSSE3 0
 #  endif
-#else
+#  if defined __SSE4_1__ || _MSC_VER >= 1600
+#    include <smmintrin.h>
+#    define CV_SSE4_1 1
+#  endif
+#  if defined __SSE4_2__ || _MSC_VER >= 1600
+#    include <nmmintrin.h>
+#    define CV_SSE4_2 1
+#  endif
+#  if defined __AVX__ || _MSC_VER >= 1600
+#    include <immintrin.h>
+#    define CV_AVX 1
+#  endif
+#  else
 #  define CV_SSE 0
 #  define CV_SSE2 0
 #  define CV_SSE3 0
 #  define CV_SSSE3 0
-#endif
+#  define CV_SSE4_1 0
+#  define CV_SSE4_2 0
+#  define CV_AVX 0
+#  endif
 
 #if defined ANDROID && defined __ARM_NEON__
 #  include "arm_neon.h"
@@ -764,4 +779,4 @@ CV_EXPORTS bool icvCheckGlError(const char* file, const int line, const char* fu
 
 #endif //__cplusplus
 
-#endif // __OPENCV_CORE_INTERNAL_HPP__
+#endif // __OPENCV_CORE_INTERNAL_HPP__
\ No newline at end of file
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 98370eaa7..1f6a85d87 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -657,6 +657,62 @@ cvtScale_( const short* src, size_t sstep,
     }
 }
 
+template<> void
+cvtScale_<short, int, float>( const short* src, size_t sstep,
+                              int* dst, size_t dstep, Size size,
+                              float scale, float shift )
+{
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        int x = 0;
+
+        #if CV_SSE2
+        if(USE_SSE2)//~5X
+        {
+            __m128 scale128 = _mm_set1_ps (scale);
+            __m128 shift128 = _mm_set1_ps (shift);
+            for(; x <= size.width - 8; x += 8 )
+            {
+                __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
+                __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
+                __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
+                __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
+                rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
+                rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
+                r0 = _mm_cvtps_epi32(rf0);
+                r1 = _mm_cvtps_epi32(rf1);
+
+                _mm_storeu_si128((__m128i*)(dst + x), r0);
+                _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
+            }
+        }
+        #endif
+
+        //We will wait Haswell
+        /*
+        #if CV_AVX
+        if(USE_AVX)//2X - bad variant
+        {
+            ////TODO:AVX implementation (optimization?) required
+            __m256 scale256 = _mm256_set1_ps (scale);
+            __m256 shift256 = _mm256_set1_ps (shift);
+            for(; x <= size.width - 8; x += 8 )
+            {
+                __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
+                __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
+                __m256i res = _mm256_cvtps_epi32(r0);
+                _mm256_storeu_si256 ((__m256i*)(dst+x), res);
+            }
+        }
+        #endif*/
+
+        for(; x < size.width; x++ )
+            dst[x] = saturate_cast<int>(src[x]*scale + shift);
+    }
+}
 
 template<typename T, typename DT> static void
 cvt_( const T* src, size_t sstep,
@@ -1305,4 +1361,4 @@ CV_IMPL void cvNormalize( const CvArr* srcarr, CvArr* dstarr,
     cv::normalize( src, dst, a, b, norm_type, dst.type(), mask );
 }
 
-/* End of file. */
+/* End of file. */
\ No newline at end of file
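The cvtScale_<short, int, float> specialization above widens four shorts at a time
(_mm_unpacklo_epi16 + _mm_srai_epi32 yields sign-extended 32-bit lanes), converts
them to float, applies the scale/shift, and packs the rounded results back with
_mm_cvtps_epi32. Below is a minimal standalone sketch of the same widening trick,
not part of the patch; the helper name scale_s16_to_s32 and the test values are
made up for illustration.

// Sketch only: short -> int conversion with scaling, SSE2 baseline.
// Build: any x86-64 compiler; on 32-bit x86 add -msse2.
#include <emmintrin.h>   // SSE2 intrinsics
#include <cmath>         // std::lrint
#include <cstdio>

static void scale_s16_to_s32(const short* src, int* dst, int n, float scale, float shift)
{
    int x = 0;
    __m128 s = _mm_set1_ps(scale), sh = _mm_set1_ps(shift);
    for( ; x <= n - 8; x += 8 )
    {
        // load 4 shorts per register, sign-extend via self-unpack + arithmetic shift
        __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
        __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
        __m128 f0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
        __m128 f1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
        f0 = _mm_add_ps(_mm_mul_ps(f0, s), sh);                         // src*scale + shift
        f1 = _mm_add_ps(_mm_mul_ps(f1, s), sh);
        _mm_storeu_si128((__m128i*)(dst + x), _mm_cvtps_epi32(f0));     // round to nearest
        _mm_storeu_si128((__m128i*)(dst + x + 4), _mm_cvtps_epi32(f1));
    }
    for( ; x < n; x++ )                                                 // scalar tail
        dst[x] = (int)std::lrint(src[x]*scale + shift);
}

int main()
{
    short src[10] = { -3, -2, -1, 0, 1, 2, 3, 4, 5, 6 };
    int dst[10];
    scale_s16_to_s32(src, dst, 10, 2.f, 10.f);
    for( int i = 0; i < 10; i++ )
        printf("%d ", dst[i]);                   // prints 4 6 8 10 12 14 16 18 20 22
    printf("\n");
    return 0;
}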
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 24e6a5118..6c33db54a 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -78,6 +78,66 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, ucha
     }
 }
 
+template<> static void
+copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
+{
+    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
+    {
+        const uchar* src = (const uchar*)_src;
+        uchar* dst = (uchar*)_dst;
+        int x = 0;
+        #if CV_SSE4_2
+        if(USE_SSE4_2)//
+        {
+            __m128i zero = _mm_setzero_si128 ();
+
+            for( ; x <= size.width - 16; x += 16 )
+            {
+                const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src+x));
+                __m128i _mask = _mm_lddqu_si128((const __m128i*)(mask+x));
+                __m128i rDst = _mm_lddqu_si128((__m128i*)(dst+x));
+                __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
+                rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
+                _mm_storeu_si128((__m128i*)(dst + x), rDst);
+            }
+        }
+        #endif
+        for( ; x < size.width; x++ )
+            if( mask[x] )
+                dst[x] = src[x];
+    }
+}
+
+template<> static void
+copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
+{
+    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
+    {
+        const ushort* src = (const ushort*)_src;
+        ushort* dst = (ushort*)_dst;
+        int x = 0;
+        #if CV_SSE4_2
+        if(USE_SSE4_2)//
+        {
+            __m128i zero = _mm_setzero_si128 ();
+            for( ; x <= size.width - 8; x += 8 )
+            {
+                const __m128i rSrc =_mm_lddqu_si128((const __m128i*)(src+x));
+                __m128i _mask = _mm_loadl_epi64((const __m128i*)(mask+x));
+                _mask = _mm_unpacklo_epi8(_mask, _mask);
+                __m128i rDst = _mm_lddqu_si128((const __m128i*)(dst+x));
+                __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
+                rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
+                _mm_storeu_si128((__m128i*)(dst + x), rDst);
+            }
+        }
+        #endif
+        for( ; x < size.width; x++ )
+            if( mask[x] )
+                dst[x] = src[x];
+    }
+}
+
 static void
 copyMaskGeneric(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size, void* _esz)
 {
@@ -567,4 +627,4 @@ cvRepeat( const CvArr* srcarr, CvArr* dstarr )
     cv::repeat(src, dst.rows/src.rows, dst.cols/src.cols, dst);
 }
 
-/* End of file. */
+/* End of file. */
\ No newline at end of file
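Both copyMask_ specializations above use the same kernel: _mm_cmpeq_epi8 turns
zero mask bytes into 0xFF, and _mm_blendv_epi8 (an SSE4.1 instruction, guarded
here by the runtime SSE4.2 flag) then keeps the destination byte wherever the
mask was zero and takes the source byte everywhere else. A self-contained
sketch of that idea for 8-bit data follows; the helper name copy_if_mask and
the test data are illustrative, not from the patch.

// Sketch only: masked byte copy with cmpeq + blendv.
// Build: g++ -msse4.1 (or newer) on x86.
#include <smmintrin.h>   // SSE4.1 (_mm_blendv_epi8)
#include <cstdio>

static void copy_if_mask(const unsigned char* src, const unsigned char* mask,
                         unsigned char* dst, int n)
{
    int x = 0;
    const __m128i zero = _mm_setzero_si128();
    for( ; x <= n - 16; x += 16 )
    {
        __m128i s = _mm_loadu_si128((const __m128i*)(src + x));
        __m128i m = _mm_loadu_si128((const __m128i*)(mask + x));
        __m128i d = _mm_loadu_si128((const __m128i*)(dst + x));
        __m128i negMask = _mm_cmpeq_epi8(m, zero);        // 0xFF where mask == 0
        // where negMask is set keep dst, elsewhere take src
        _mm_storeu_si128((__m128i*)(dst + x), _mm_blendv_epi8(s, d, negMask));
    }
    for( ; x < n; x++ )                                   // scalar tail
        if( mask[x] )
            dst[x] = src[x];
}

int main()
{
    unsigned char src[20], mask[20], dst[20];
    for( int i = 0; i < 20; i++ )
    {
        src[i] = (unsigned char)i;
        mask[i] = (i % 3) ? 255 : 0;   // every third byte masked out
        dst[i] = 100;
    }
    copy_if_mask(src, mask, dst, 20);
    for( int i = 0; i < 20; i++ )
        printf("%d ", dst[i]);         // masked-out positions keep 100
    printf("\n");
    return 0;
}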
diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp
index c66923d7e..74c6edd3b 100644
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -1010,6 +1010,25 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
         if( type == CV_32FC1 )
         {
             double d = det2(Sf);
+            #if CV_SSE4_2
+            if(USE_SSE4_2)
+            {
+                __m128 zero = _mm_setzero_ps();
+                __m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); //t0 = sf(0,0) sf(0,1)
+                __m128 t1 = _mm_loadh_pi(zero,(const __m64*)((const float*)(srcdata+srcstep))); //t1 = sf(1,0) sf(1,1)
+                __m128 s0 = _mm_blend_ps(t0,t1,12);
+                d = 1./d;
+                result = true;
+                __m128 det =_mm_set1_ps((float)d);
+                s0 = _mm_mul_ps(s0, det);
+                const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0};
+                __m128 pattern = _mm_load_ps((const float*)inv);
+                s0 = _mm_xor_ps(s0, pattern);//==-1*s0
+                s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3));
+                _mm_storel_pi((__m64*)dstdata, s0);
+                _mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0);
+            }
+            #else
             if( d != 0. )
             {
                 double t0, t1;
@@ -1022,12 +1041,36 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
                 t0 = -Sf(0,1)*d;
                 t1 = -Sf(1,0)*d;
                 Df(0,1) = (float)t0;
-                Df(1,0) = (float)t1;
+                Df(1,0) = (float)t1;
             }
+            #endif
         }
         else
         {
-            double d = det2(Sd);
+            double d = det2(Sd);
+            #if CV_SSE2
+            if(USE_SSE2)
+            {
+                __m128d s0 = _mm_loadu_pd((const double*)srcdata); //s0 = sf(0,0) sf(0,1)
+                __m128d s1 = _mm_loadu_pd ((const double*)(srcdata+srcstep));//s1 = sf(1,0) sf(1,1)
+                __m128d sm = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(1,0)); //sm = sf(0,0) sf(1,1) - main diagonal
+                __m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); //sm = sf(0,1) sf(1,0) - secondary diagonal
+                result = true;
+                d = 1./d;
+                __m128d det = _mm_load1_pd((const double*)&d);
+                sm = _mm_mul_pd(sm, det);
+                //__m128d pattern = _mm_set1_pd(-1.);
+                static const uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80};
+                __m128d pattern = _mm_load1_pd((double*)inv);
+                ss = _mm_mul_pd(ss, det);
+                ss = _mm_xor_pd(ss, pattern);//==-1*ss
+                //ss = _mm_mul_pd(ss,pattern);
+                s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1));
+                s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1));
+                _mm_store_pd((double*)dstdata, s0);
+                _mm_store_pd((double*)(dstdata+dststep), s1);
+            }
+            #else
             if( d != 0. )
             {
                 double t0, t1;
@@ -1042,6 +1085,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
                 Dd(0,1) = t0;
                 Dd(1,0) = t1;
             }
+            #endif
         }
     }
     else if( n == 3 )
@@ -1148,6 +1192,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
 
     return result;
 }
+
 /****************************************************************************************\
 *                              Solving a linear system                                  *
 \****************************************************************************************/
@@ -1797,4 +1842,4 @@ cvSVBkSb( const CvArr* warr, const CvArr* uarr,
 
     cv::SVD::backSubst(w, u, v, rhs, dst);
     CV_Assert( dst.data == dst0.data );
-}
+}
\ No newline at end of file
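Both invert() branches above vectorize the closed-form 2x2 inverse: for
A = [a b; c d] with det = a*d - b*c, A^-1 = (1/det) * [d -b; -c a]. The
shuffles gather the main and secondary diagonals, and the xor against a stored
sign-bit pattern performs the -b/-c negation without an extra multiply. A
scalar reference sketch of the same formula, with the illustrative helper name
invert2x2 (not part of the patch):

// Sketch only: closed-form 2x2 matrix inverse (row-major a, b, c, d).
#include <cstdio>

static bool invert2x2(const double A[4], double R[4])
{
    double det = A[0]*A[3] - A[1]*A[2];
    if( det == 0. )
        return false;                  // singular matrix, no inverse
    double s = 1./det;
    R[0] =  A[3]*s;  R[1] = -A[1]*s;   // swap the main diagonal, negate the
    R[2] = -A[2]*s;  R[3] =  A[0]*s;   // secondary diagonal, scale by 1/det
    return true;
}

int main()
{
    double A[4] = { 4, 7, 2, 6 }, R[4];
    if( invert2x2(A, R) )
        printf("%g %g\n%g %g\n", R[0], R[1], R[2], R[3]);  // 0.6 -0.7 / -0.2 0.4
    return 0;
}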
diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index 77bc4c34a..81b9d6e80 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -170,6 +170,8 @@ struct NoVec
 };
 
 extern volatile bool USE_SSE2;
+extern volatile bool USE_SSE4_2;
+extern volatile bool USE_AVX;
 
 enum { BLOCK_SIZE = 1024 };
 
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index b5b08fb54..3626a2a67 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -221,6 +221,36 @@ static int countNonZero_(const T* src, int len )
     return nz;
 }
 
+template <>
+int countNonZero_ <uchar> (const uchar* src, int len)
+{
+    int i=0, nz = 0;
+    #if CV_SSE4_2
+    if(USE_SSE4_2)//5x-6x
+    {
+        __m128i pattern = _mm_setzero_si128 ();
+        __m128i inv = _mm_set1_epi8((char)1);
+        __int64 CV_DECL_ALIGNED(16) buf[2];
+        for (; i<=len-16; i+=16)
+        {
+            __m128i r0 = _mm_lddqu_si128((const __m128i*)(src+i));
+            __m128i res = _mm_cmpeq_epi8(r0, pattern);
+            res = _mm_add_epi8(res, inv);//11111111+1=00000000, 00000000+1=00000001
+            _mm_store_si128 ((__m128i*)buf, res);
+
+            __int64 countLow = _mm_popcnt_u64(buf[0]);
+            nz += countLow;
+
+            __int64 countHigh = _mm_popcnt_u64(buf[1]);
+            nz +=countHigh;
+        }
+    }
+    #endif
+    for( ; i < len; i++ )
+        nz += src[i] != 0;
+    return nz;
+}
+
 static int countNonZero8u( const uchar* src, int len )
 { return countNonZero_(src, len); }
 
@@ -1982,4 +2012,4 @@ cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )
         cv::extractImageCOI(imgB, b);
 
     return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
-}
+}
\ No newline at end of file
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index fc4dceda6..b8a46611b 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -205,6 +205,8 @@ IPPInitializer ippInitializer;
 #endif
 
 volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2];
+volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2];
+volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX];
 
 void setUseOptimized( bool flag )
 {
@@ -921,4 +923,4 @@ BOOL WINAPI DllMain( HINSTANCE, DWORD fdwReason, LPVOID )
 }
 #endif
 
-/* End of file. */
+/* End of file. */
\ No newline at end of file
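The countNonZero_<uchar> hunk in stat.cpp above counts non-zero bytes by
comparing against zero (giving 0xFF for zero bytes), adding 1 so that every
non-zero input byte maps to 0x01, and then feeding the two 64-bit halves to
_mm_popcnt_u64. A standalone sketch of that counting idea, assuming a 64-bit
build; the helper name count_nonzero_u8 and the test data are illustrative.

// Sketch only: count non-zero bytes with cmpeq + add(1) + POPCNT.
// Build: g++ -msse4.2 on x86-64 (on 32-bit targets use _mm_popcnt_u32 twice per half).
#include <nmmintrin.h>   // SSE4.2 / POPCNT
#include <cstdint>
#include <cstdio>

static int count_nonzero_u8(const unsigned char* src, int len)
{
    int i = 0, nz = 0;
    const __m128i zero = _mm_setzero_si128();
    const __m128i one  = _mm_set1_epi8(1);
    for( ; i <= len - 16; i += 16 )
    {
        __m128i r = _mm_lddqu_si128((const __m128i*)(src + i));
        // 0xFF where src == 0; +1 turns that into 0x00 and non-zero bytes into 0x01
        __m128i bits = _mm_add_epi8(_mm_cmpeq_epi8(r, zero), one);
        uint64_t buf[2];
        _mm_storeu_si128((__m128i*)buf, bits);
        nz += (int)(_mm_popcnt_u64(buf[0]) + _mm_popcnt_u64(buf[1]));
    }
    for( ; i < len; i++ )              // scalar tail
        nz += src[i] != 0;
    return nz;
}

int main()
{
    unsigned char buf[40];
    for( int i = 0; i < 40; i++ )
        buf[i] = (unsigned char)(i % 4);      // every fourth byte is zero
    printf("%d\n", count_nonzero_u8(buf, 40)); // prints 30
    return 0;
}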